Skip to content

Commit 4b2c5e3

Browse files
committed
Address comments.
1 parent 6a9c5e2 commit 4b2c5e3

File tree

11 files changed

+68
-62
lines changed

11 files changed

+68
-62
lines changed

src/sagemaker/amazon/amazon_estimator.py

Lines changed: 2 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -31,21 +31,17 @@ class AmazonAlgorithmEstimatorBase(EstimatorBase):
3131
feature_dim = hp('feature_dim', (validation.isint, validation.gt(0)))
3232
mini_batch_size = hp('mini_batch_size', (validation.isint, validation.gt(0)))
3333

34-
def __init__(self, role, train_instance_count, train_instance_type,
35-
default_mini_batch_size=None, data_location=None, **kwargs):
34+
def __init__(self, role, train_instance_count, train_instance_type, data_location=None, **kwargs):
3635
"""Initialize an AmazonAlgorithmEstimatorBase.
3736
3837
Args:
39-
default_mini_batch_size (int): Default size of mini-batch used for training set for algorithms that
40-
require this parameter.
4138
data_location (str or None): The s3 prefix to upload RecordSet objects to, expressed as an
4239
S3 url. For example "s3://example-bucket/some-key-prefix/". Objects will be
4340
saved in a unique sub-directory of the specified location. If None, a default
4441
data location will be used."""
4542
super(AmazonAlgorithmEstimatorBase, self).__init__(role, train_instance_count, train_instance_type,
4643
**kwargs)
4744

48-
self.default_mini_batch_size = default_mini_batch_size
4945
default_location = "s3://{}/sagemaker-record-sets/".format(self.sagemaker_session.default_bucket())
5046
data_location = data_location or default_location
5147
self.data_location = data_location
@@ -89,8 +85,8 @@ def fit(self, records, mini_batch_size=None, **kwargs):
8985
mini_batch_size (int or None): The size of each mini-batch to use when training. If None, a
9086
default value will be used.
9187
"""
92-
self.mini_batch_size = mini_batch_size or self.default_mini_batch_size
9388
self.feature_dim = records.feature_dim
89+
self.mini_batch_size = mini_batch_size
9490

9591
data = {records.channel: s3_input(records.s3_data, distribution='ShardedByS3Key',
9692
s3_data_type=records.s3_data_type)}

src/sagemaker/amazon/factorization_machines.py

Lines changed: 19 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@
1313
from sagemaker.amazon.amazon_estimator import AmazonAlgorithmEstimatorBase, registry
1414
from sagemaker.amazon.common import numpy_to_record_serializer, record_deserializer
1515
from sagemaker.amazon.hyperparameter import Hyperparameter as hp # noqa
16-
from sagemaker.amazon.validation import gt, isin, isint, ge, isfloat
16+
from sagemaker.amazon.validation import gt, isin, isint, ge, isnumber
1717
from sagemaker.predictor import RealTimePredictor
1818
from sagemaker.model import Model
1919
from sagemaker.session import Session
@@ -27,30 +27,30 @@ class FactorizationMachines(AmazonAlgorithmEstimatorBase):
2727
predictor_type = hp('predictor_type', isin('binary_classifier', 'regressor'),
2828
'Value "binary_classifier" or "regressor"')
2929
epochs = hp('epochs', (gt(0), isint), "An integer greater than 0")
30-
clip_gradient = hp('clip_gradient', isfloat, "A float value")
31-
eps = hp('eps', isfloat, "A float value")
32-
rescale_grad = hp('rescale_grad', isfloat, "A float value")
33-
bias_lr = hp('bias_lr', (ge(0), isfloat), "A non-negative float")
34-
linear_lr = hp('linear_lr', (ge(0), isfloat), "A non-negative float")
35-
factors_lr = hp('factors_lr', (ge(0), isfloat), "A non-negative float")
36-
bias_wd = hp('bias_wd', (ge(0), isfloat), "A non-negative float")
37-
linear_wd = hp('linear_wd', (ge(0), isfloat), "A non-negative float")
38-
factors_wd = hp('factors_wd', (ge(0), isfloat), "A non-negative float")
30+
clip_gradient = hp('clip_gradient', isnumber, "A float value")
31+
eps = hp('eps', isnumber, "A float value")
32+
rescale_grad = hp('rescale_grad', isnumber, "A float value")
33+
bias_lr = hp('bias_lr', (ge(0), isnumber), "A non-negative float")
34+
linear_lr = hp('linear_lr', (ge(0), isnumber), "A non-negative float")
35+
factors_lr = hp('factors_lr', (ge(0), isnumber), "A non-negative float")
36+
bias_wd = hp('bias_wd', (ge(0), isnumber), "A non-negative float")
37+
linear_wd = hp('linear_wd', (ge(0), isnumber), "A non-negative float")
38+
factors_wd = hp('factors_wd', (ge(0), isnumber), "A non-negative float")
3939
bias_init_method = hp('bias_init_method', isin('normal', 'uniform', 'constant'),
4040
'Value "normal", "uniform" or "constant"')
41-
bias_init_scale = hp('bias_init_scale', (ge(0), isfloat), "A non-negative float")
42-
bias_init_sigma = hp('bias_init_sigma', (ge(0), isfloat), "A non-negative float")
43-
bias_init_value = hp('bias_init_value', isfloat, "A float value")
41+
bias_init_scale = hp('bias_init_scale', (ge(0), isnumber), "A non-negative float")
42+
bias_init_sigma = hp('bias_init_sigma', (ge(0), isnumber), "A non-negative float")
43+
bias_init_value = hp('bias_init_value', isnumber, "A float value")
4444
linear_init_method = hp('linear_init_method', isin('normal', 'uniform', 'constant'),
4545
'Value "normal", "uniform" or "constant"')
46-
linear_init_scale = hp('linear_init_scale', (ge(0), isfloat), "A non-negative float")
47-
linear_init_sigma = hp('linear_init_sigma', (ge(0), isfloat), "A non-negative float")
48-
linear_init_value = hp('linear_init_value', isfloat, "A float value")
46+
linear_init_scale = hp('linear_init_scale', (ge(0), isnumber), "A non-negative float")
47+
linear_init_sigma = hp('linear_init_sigma', (ge(0), isnumber), "A non-negative float")
48+
linear_init_value = hp('linear_init_value', isnumber, "A float value")
4949
factors_init_method = hp('factors_init_method', isin('normal', 'uniform', 'constant'),
5050
'Value "normal", "uniform" or "constant"')
51-
factors_init_scale = hp('factors_init_scale', (ge(0), isfloat), "A non-negative float")
52-
factors_init_sigma = hp('factors_init_sigma', (ge(0), isfloat), "A non-negative float")
53-
factors_init_value = hp('factors_init_value', isfloat, "A float value")
51+
factors_init_scale = hp('factors_init_scale', (ge(0), isnumber), "A non-negative float")
52+
factors_init_sigma = hp('factors_init_sigma', (ge(0), isnumber), "A non-negative float")
53+
factors_init_value = hp('factors_init_value', isnumber, "A float value")
5454

5555
def __init__(self, role, train_instance_count, train_instance_type,
5656
num_factors, predictor_type,

src/sagemaker/amazon/kmeans.py

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -33,8 +33,8 @@ class KMeans(AmazonAlgorithmEstimatorBase):
3333
epochs = hp('epochs', (gt(0), isint), 'An integer greater-than 0')
3434
center_factor = hp('extra_center_factor', (gt(0), isint), 'An integer greater-than 0')
3535

36-
def __init__(self, role, train_instance_count, train_instance_type, k, default_mini_batch_size=5000,
37-
init_method=None, max_iterations=None, tol=None, num_trials=None, local_init_method=None,
36+
def __init__(self, role, train_instance_count, train_instance_type, k, init_method=None,
37+
max_iterations=None, tol=None, num_trials=None, local_init_method=None,
3838
half_life_time_size=None, epochs=None, center_factor=None, **kwargs):
3939
"""
4040
A k-means clustering :class:`~sagemaker.amazon.AmazonAlgorithmEstimatorBase`. Finds k clusters of data in an
@@ -67,7 +67,6 @@ def __init__(self, role, train_instance_count, train_instance_type, k, default_m
6767
train_instance_count (int): Number of Amazon EC2 instances to use for training.
6868
train_instance_type (str): Type of EC2 instance to use for training, for example, 'ml.c4.xlarge'.
6969
k (int): The number of clusters to produce.
70-
default_mini_batch_size (int): Default size of mini-batch used for training.
7170
init_method (str): How to initialize cluster locations. One of 'random' or 'kmeans++'.
7271
max_iterations (int): Maximum iterations for Lloyds EM procedure in the local kmeans used in finalize stage.
7372
tol (int): Tolerance for change in ssd for early stopping in local kmeans.
@@ -84,8 +83,7 @@ def __init__(self, role, train_instance_count, train_instance_type, k, default_m
8483
reduce the number of centers to ``k`` when finalizing
8584
**kwargs: base class keyword argument values.
8685
"""
87-
super(KMeans, self).__init__(role, train_instance_count, train_instance_type,
88-
default_mini_batch_size, **kwargs)
86+
super(KMeans, self).__init__(role, train_instance_count, train_instance_type, **kwargs)
8987
self.k = k
9088
self.init_method = init_method
9189
self.max_iterations = max_iterations
@@ -101,6 +99,9 @@ def create_model(self):
10199
s3 model data produced by this Estimator."""
102100
return KMeansModel(self.model_data, self.role, self.sagemaker_session)
103101

102+
def fit(self, records, mini_batch_size=5000, **kwargs):
    """Fit this KMeans estimator on a set of records.

    Args:
        records: The records to train on; passed through unchanged to the base class ``fit``.
        mini_batch_size (int or None): The size of each mini-batch to use when training.
            Defaults to 5000. An explicit ``None`` is also replaced by 5000.
        **kwargs: Additional keyword arguments passed through to the base class ``fit``.
    """
    # An explicit None skips the signature default and would be handed to the base class
    # as-is. The LinearLearner and PCA overrides replace a missing value with their own
    # default, so do the same here. Only None is replaced: other values (including 0)
    # are still forwarded untouched so base-class validation sees them.
    if mini_batch_size is None:
        mini_batch_size = 5000
    super(KMeans, self).fit(records, mini_batch_size, **kwargs)
104+
104105
def hyperparameters(self):
105106
"""Return the SageMaker hyperparameters for training this KMeans Estimator"""
106107
hp = dict(force_dense='True') # KMeans requires this hp to fit on Record objects

src/sagemaker/amazon/linear_learner.py

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,8 @@ class LinearLearner(AmazonAlgorithmEstimatorBase):
2323

2424
repo = 'linear-learner:1'
2525

26+
DEFAULT_MINI_BATCH_SIZE = 1000
27+
2628
binary_classifier_model_selection_criteria = hp('binary_classifier_model_selection_criteria',
2729
isin('accuracy', 'f1', 'precision_at_target_recall',
2830
'recall_at_target_precision', 'cross_entropy_loss'))
@@ -60,7 +62,7 @@ class LinearLearner(AmazonAlgorithmEstimatorBase):
6062
unbias_label = hp('unbias_label', isbool, 'A boolean')
6163
num_point_for_scalar = hp('num_point_for_scalar', (isint, gt(0)), 'An integer greater-than 0')
6264

63-
def __init__(self, role, train_instance_count, train_instance_type, predictor_type,
65+
def __init__(self, role, train_instance_count, train_instance_type, predictor_type='binary_classifier',
6466
binary_classifier_model_selection_criteria=None, target_recall=None, target_precision=None,
6567
positive_example_weight_mult=None, epochs=None, use_bias=None, num_models=None,
6668
num_calibration_samples=None, init_method=None, init_scale=None, init_sigma=None, init_bias=None,
@@ -191,6 +193,13 @@ def create_model(self):
191193

192194
return LinearLearnerModel(self, self.model_data, self.role, self.sagemaker_session)
193195

196+
def fit(self, records, mini_batch_size=None, **kwargs):
    """Train on ``records``, picking a mini-batch size when the caller gives none.

    When ``mini_batch_size`` is falsy, the size used is ``DEFAULT_MINI_BATCH_SIZE``
    capped at the number of records per training instance, and never below 1.

    Args:
        records: Training records; ``records.num_records`` is read to size the fallback.
        mini_batch_size (int or None): Mini-batch size to train with, or a falsy value
            to use the fallback described above.
        **kwargs: Passed through unchanged to the base class ``fit``.
    """
    # mini_batch_size can't be greater than number of records or the training job fails,
    # so the fallback is limited by the records-per-instance count.
    records_per_instance = int(records.num_records / self.train_instance_count)
    fallback_size = min(self.DEFAULT_MINI_BATCH_SIZE, max(1, records_per_instance))
    if mini_batch_size:
        chosen_size = mini_batch_size
    else:
        chosen_size = fallback_size
    super(LinearLearner, self).fit(records, chosen_size, **kwargs)
202+
194203

195204
class LinearLearnerPredictor(RealTimePredictor):
196205
"""Performs binary-classification or regression prediction from input vectors.

src/sagemaker/amazon/pca.py

Lines changed: 11 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,8 @@ class PCA(AmazonAlgorithmEstimatorBase):
2222

2323
repo = 'pca:1'
2424

25+
DEFAULT_MINI_BATCH_SIZE = 500
26+
2527
num_components = hp(name='num_components', validate=lambda x: x > 0 and isinstance(x, int),
2628
validation_message='Value must be an integer greater than zero')
2729
algorithm_mode = hp(name='algorithm_mode', validate=lambda x: x in ['regular', 'stable', 'randomized'],
@@ -31,7 +33,7 @@ class PCA(AmazonAlgorithmEstimatorBase):
3133
extra_components = hp(name='extra_components', validate=lambda x: x >= 0 and isinstance(x, int),
3234
validation_message="Value must be an integer greater than or equal to 0")
3335

34-
def __init__(self, role, train_instance_count, train_instance_type, num_components, default_mini_batch_size,
36+
def __init__(self, role, train_instance_count, train_instance_type, num_components,
3537
algorithm_mode=None, subtract_mean=None, extra_components=None, **kwargs):
3638
"""A Principal Components Analysis (PCA) :class:`~sagemaker.amazon.amazon_estimator.AmazonAlgorithmEstimatorBase`.
3739
@@ -66,7 +68,6 @@ def __init__(self, role, train_instance_count, train_instance_type, num_componen
6668
train_instance_count (int): Number of Amazon EC2 instances to use for training.
6769
train_instance_type (str): Type of EC2 instance to use for training, for example, 'ml.c4.xlarge'.
6870
num_components (int): The number of principal components. Must be greater than zero.
69-
default_mini_batch_size (int): Default size of mini-batch used for training.
7071
algorithm_mode (str): Mode for computing the principal components. One of 'regular', 'stable' or
7172
'randomized'.
7273
subtract_mean (bool): Whether the data should be unbiased both during train and at inference.
@@ -75,8 +76,7 @@ def __init__(self, role, train_instance_count, train_instance_type, num_componen
7576
to the maximum of 10 and num_components will be used. Valid for randomized mode only.
7677
**kwargs: base class keyword argument values.
7778
"""
78-
super(PCA, self).__init__(role, train_instance_count, train_instance_type,
79-
default_mini_batch_size, **kwargs)
79+
super(PCA, self).__init__(role, train_instance_count, train_instance_type, **kwargs)
8080
self.num_components = num_components
8181
self.algorithm_mode = algorithm_mode
8282
self.subtract_mean = subtract_mean
@@ -88,6 +88,13 @@ def create_model(self):
8888

8989
return PCAModel(self.model_data, self.role, sagemaker_session=self.sagemaker_session)
9090

91+
def fit(self, records, mini_batch_size=None, **kwargs):
    """Train PCA on ``records``, always supplying a mini-batch size to the base class.

    Args:
        records: Training records; ``records.num_records`` is read to derive the default.
        mini_batch_size (int or None): Mini-batch size to train with. When falsy, the
            smaller of ``DEFAULT_MINI_BATCH_SIZE`` and the per-instance record count
            (at least 1) is used instead.
        **kwargs: Passed through unchanged to the base class ``fit``.
    """
    # mini_batch_size is a required parameter, so a value is always derived up front.
    per_instance = max(1, int(records.num_records / self.train_instance_count))
    derived_default = min(self.DEFAULT_MINI_BATCH_SIZE, per_instance)
    batch_size = mini_batch_size
    if not batch_size:
        batch_size = derived_default
    super(PCA, self).fit(records, batch_size, **kwargs)
97+
9198

9299
class PCAPredictor(RealTimePredictor):
93100
"""Transforms input vectors to lower-dimensional representations.

src/sagemaker/amazon/validation.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -46,4 +46,3 @@ def validate(value):
4646
isint = istype(int)
4747
isbool = istype(bool)
4848
isnumber = istype(numbers.Number) # noqa
49-
isfloat = istype(float)

tests/integ/test_factorization_machines.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -38,7 +38,6 @@ def test_factorization_machines():
3838
fm = FactorizationMachines(role='SageMakerRole', train_instance_count=1,
3939
train_instance_type='ml.c4.xlarge',
4040
num_factors=10, predictor_type='regressor',
41-
default_mini_batch_size=100,
4241
epochs=2, clip_gradient=1e2, eps=0.001, rescale_grad=1.0/100,
4342
sagemaker_session=sagemaker_session, base_job_name='test-fm')
4443

tests/integ/test_linear_learner.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -39,8 +39,8 @@ def test_linear_learner():
3939
train_set[1][100:200] = 0
4040
train_set = train_set[0], train_set[1].astype(np.dtype('float32'))
4141

42-
ll = LinearLearner('SageMakerRole', 1, 'ml.c4.2xlarge', predictor_type='binary_classifier',
43-
base_job_name='test-linear-learner', sagemaker_session=sagemaker_session)
42+
ll = LinearLearner('SageMakerRole', 1, 'ml.c4.2xlarge', base_job_name='test-linear-learner',
43+
sagemaker_session=sagemaker_session)
4444
ll.binary_classifier_model_selection_criteria = 'accuracy'
4545
ll.target_reacall = 0.5
4646
ll.target_precision = 0.5
@@ -71,7 +71,7 @@ def test_linear_learner():
7171
ll.unbias_data = True
7272
ll.unbias_label = False
7373
ll.num_point_for_scala = 10000
74-
ll.fit(ll.record_set(train_set[0][:200], train_set[1][:200]), mini_batch_size=100)
74+
ll.fit(ll.record_set(train_set[0][:200], train_set[1][:200]))
7575

7676
endpoint_name = name_from_base('linear-learner')
7777
with timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session, minutes=20):

tests/integ/test_pca.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,7 @@ def test_pca():
3535
train_set, _, _ = pickle.load(f, **pickle_args)
3636

3737
pca = sagemaker.amazon.pca.PCA(role='SageMakerRole', train_instance_count=1,
38-
train_instance_type='ml.m4.xlarge', default_mini_batch_size=500,
38+
train_instance_type='ml.m4.xlarge',
3939
num_components=48, sagemaker_session=sagemaker_session, base_job_name='test-pca')
4040

4141
pca.algorithm_mode = 'randomized'

0 commit comments

Comments
 (0)