@@ -100,6 +100,7 @@ class EstimatorBase(with_metaclass(ABCMeta, object)): # pylint: disable=too-man
100
100
instance.
101
101
"""
102
102
103
+ LAUNCH_PT_XLA_ENV_NAME = "sagemaker_pytorch_xla_multi_worker_enabled"
103
104
LAUNCH_PS_ENV_NAME = "sagemaker_parameter_server_enabled"
104
105
LAUNCH_MPI_ENV_NAME = "sagemaker_mpi_enabled"
105
106
LAUNCH_SM_DDP_ENV_NAME = "sagemaker_distributed_dataparallel_enabled"
@@ -166,10 +167,44 @@ def __init__(
166
167
instance_type (str): Type of EC2 instance to use for training,
167
168
for example, ``'ml.c4.xlarge'``. Required if instance_groups is
168
169
not set.
169
- volume_size (int): Size in GB of the EBS volume to use for
170
- storing input data during training (default: 30). Must be large
171
- enough to store training data if File Mode is used (which is the
172
- default).
170
+ volume_size (int): Size in GB of the storage volume to use for
171
+ storing input and output data during training (default: 30).
172
+
173
+ Must be large enough to store training data if File mode is
174
+ used, which is the default mode.
175
+
176
+ When you use an ML instance with the EBS-only storage option
177
+ such as ``ml.c5`` and ``ml.p2``,
178
+ you must define the size of the EBS
179
+ volume through the ``volume_size`` parameter in the estimator class.
180
+
181
+ .. note::
182
+
183
+ When you use an ML instance with `NVMe SSD volumes
184
+ <https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/ssd-instance-store.html#nvme-ssd-volumes>`_
185
+ such as ``ml.p4d``, ``ml.g4dn``, and ``ml.g5``,
186
+ do not include this parameter in the estimator configuration.
187
+ If you use one of those ML instance types,
188
+ SageMaker neither provisions Amazon EBS General Purpose SSD
189
+ (gp2) storage nor uses this parameter to adjust the NVMe instance storage.
190
+ Available storage is fixed to the NVMe instance storage
191
+ capacity. SageMaker configures storage paths for training
192
+ datasets, checkpoints, model artifacts, and outputs to use the
193
+ entire capacity of the instance storage.
194
+
195
+ Note that if you include this parameter and specify a number that
196
+ exceeds the size of the NVMe volume attached to the instance type,
197
+ SageMaker returns an ``Invalid VolumeSizeInGB`` error.
198
+
199
+ To look up instance types and their instance storage types
200
+ and volumes, see `Amazon EC2 Instance Types
201
+ <http://aws.amazon.com/ec2/instance-types/>`_.
202
+
203
+ To find the default local paths defined by the SageMaker
204
+ training platform, see `Amazon SageMaker Training Storage
205
+ Folders for Training Datasets, Checkpoints, Model Artifacts,
206
+ and Outputs
207
+ <https://docs.aws.amazon.com/sagemaker/latest/dg/model-train-storage.html>`_.
173
208
volume_kms_key (str): Optional. KMS key ID for encrypting EBS
174
209
volume attached to the training instance (default: None).
175
210
max_run (int): Timeout in seconds for training (default: 24 *
@@ -2232,12 +2267,46 @@ def __init__(
2232
2267
instance_count (int): Number of Amazon EC2 instances to use
2233
2268
for training. Required if instance_groups is not set.
2234
2269
instance_type (str): Type of EC2 instance to use for training,
2235
- for example, 'ml.c4.xlarge'. Required if instance_groups is
2270
+ for example, ``'ml.c4.xlarge'``. Required if instance_groups is
2236
2271
not set.
2237
- volume_size (int): Size in GB of the EBS volume to use for
2238
- storing input data during training (default: 30). Must be large
2239
- enough to store training data if File Mode is used (which is the
2240
- default).
2272
+ volume_size (int): Size in GB of the storage volume to use for
2273
+ storing input and output data during training (default: 30).
2274
+
2275
+ Must be large enough to store training data if File mode is
2276
+ used, which is the default mode.
2277
+
2278
+ When you use an ML instance with the EBS-only storage option
2279
+ such as ``ml.c5`` and ``ml.p2``,
2280
+ you must define the size of the EBS
2281
+ volume through the ``volume_size`` parameter in the estimator class.
2282
+
2283
+ .. note::
2284
+
2285
+ When you use an ML instance with `NVMe SSD volumes
2286
+ <https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/ssd-instance-store.html#nvme-ssd-volumes>`_
2287
+ such as ``ml.p4d``, ``ml.g4dn``, and ``ml.g5``,
2288
+ do not include this parameter in the estimator configuration.
2289
+ If you use one of those ML instance types,
2290
+ SageMaker neither provisions Amazon EBS General Purpose SSD
2291
+ (gp2) storage nor uses this parameter to adjust the NVMe instance storage.
2292
+ Available storage is fixed to the NVMe instance storage
2293
+ capacity. SageMaker configures storage paths for training
2294
+ datasets, checkpoints, model artifacts, and outputs to use the
2295
+ entire capacity of the instance storage.
2296
+
2297
+ Note that if you include this parameter and specify a number that
2298
+ exceeds the size of the NVMe volume attached to the instance type,
2299
+ SageMaker returns an ``Invalid VolumeSizeInGB`` error.
2300
+
2301
+ To look up instance types and their instance storage types
2302
+ and volumes, see `Amazon EC2 Instance Types
2303
+ <http://aws.amazon.com/ec2/instance-types/>`_.
2304
+
2305
+ To find the default local paths defined by the SageMaker
2306
+ training platform, see `Amazon SageMaker Training Storage
2307
+ Folders for Training Datasets, Checkpoints, Model Artifacts,
2308
+ and Outputs
2309
+ <https://docs.aws.amazon.com/sagemaker/latest/dg/model-train-storage.html>`_.
2241
2310
volume_kms_key (str): Optional. KMS key ID for encrypting EBS
2242
2311
volume attached to the training instance (default: None).
2243
2312
max_run (int): Timeout in seconds for training (default: 24 *
@@ -3248,6 +3317,10 @@ def _distribution_configuration(self, distribution):
3248
3317
"instance_groups"
3249
3318
]
3250
3319
3320
+ if "pytorchxla" in distribution :
3321
+ pt_xla_enabled = distribution .get ("pytorchxla" ).get ("enabled" , False )
3322
+ distribution_config [self .LAUNCH_PT_XLA_ENV_NAME ] = pt_xla_enabled
3323
+
3251
3324
if "parameter_server" in distribution :
3252
3325
ps_enabled = distribution .get ("parameter_server" ).get ("enabled" , False )
3253
3326
distribution_config [self .LAUNCH_PS_ENV_NAME ] = ps_enabled
0 commit comments