Skip to content

Commit b3276e9

Browse files
authored
Merge branch 'aws:master' into logruntag
2 parents a580412 + 8462f1a commit b3276e9

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

49 files changed

+2337
-1610
lines changed

CHANGELOG.md

+48
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,53 @@
11
# Changelog
22

3+
## v2.197.0 (2023-11-07)
4+
5+
### Features
6+
7+
* PT2.1 SM Training/Inference DLC Release
8+
9+
### Bug Fixes and Other Changes
10+
11+
* Release HuggingFace PT Neuronx training image 1.13.1
12+
* HuggingFace PT Neuronx release in SDK
13+
14+
## v2.196.0 (2023-10-27)
15+
16+
### Features
17+
18+
* inference instance type conditioned on training instance type
19+
20+
### Bug Fixes and Other Changes
21+
22+
* improved jumpstart tagging
23+
24+
## v2.195.1 (2023-10-26)
25+
26+
### Bug Fixes and Other Changes
27+
28+
* Allow either instance_type or instance_group to be defined in…
29+
* enhance image_uris unit tests
30+
31+
## v2.195.0 (2023-10-25)
32+
33+
### Features
34+
35+
* jumpstart gated model artifacts
36+
* jumpstart extract generated text from response
37+
* jumpstart construct payload utility
38+
39+
### Bug Fixes and Other Changes
40+
41+
* relax upper bound on urllib in local mode requirements
42+
* bump urllib3 version
43+
* allow smdistributed to be enabled with torch_distributed.
44+
* fix URL links
45+
46+
### Documentation Changes
47+
48+
* remove python 2 reference
49+
* update framework version links
50+
351
## v2.194.0 (2023-10-19)
452

553
### Features

VERSION

+1-1
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
2.194.1.dev0
1+
2.197.1.dev0

doc/overview.rst

+1-1
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,7 @@ After you train a model, you can save it, and then serve the model as an endpoin
3232
Prepare a Training script
3333
=========================
3434

35-
Your training script must be a Python 2.7 or 3.6 compatible source file.
35+
Your training script must be a Python 3.6 compatible source file.
3636

3737
The training script is very similar to a training script you might run outside of SageMaker, but you can access useful properties about the training environment through various environment variables, including the following:
3838

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,3 @@
1-
urllib3>=1.26.8,<1.26.15
1+
urllib3>=1.26.8,<3.0.0
22
docker>=5.0.2,<7.0.0
33
PyYAML>=5.4.1,<7

requirements/extras/test_requirements.txt

+1-1
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@ pandas>=1.3.5,<1.5
2424
scikit-learn==1.3.0
2525
cloudpickle==2.2.1
2626
scipy==1.10.1
27-
urllib3>=1.26.8,<1.26.15
27+
urllib3>=1.26.8,<3.0.0
2828
docker>=5.0.2,<7.0.0
2929
PyYAML==6.0
3030
pyspark==3.3.1

src/sagemaker/estimator.py

+17-8
Original file line numberDiff line numberDiff line change
@@ -71,7 +71,7 @@
7171
from sagemaker.utils import instance_supports_kms
7272
from sagemaker.job import _Job
7373
from sagemaker.jumpstart.utils import (
74-
add_jumpstart_tags,
74+
add_jumpstart_uri_tags,
7575
get_jumpstart_base_name_if_jumpstart_model,
7676
update_inference_tags_with_jumpstart_training_tags,
7777
)
@@ -577,9 +577,7 @@ def __init__(
577577
self.entry_point = entry_point
578578
self.dependencies = dependencies or []
579579
self.uploaded_code: Optional[UploadedCode] = None
580-
self.tags = add_jumpstart_tags(
581-
tags=tags, training_model_uri=self.model_uri, training_script_uri=self.source_dir
582-
)
580+
583581
if self.instance_type in ("local", "local_gpu"):
584582
if self.instance_type == "local_gpu" and self.instance_count > 1:
585583
raise RuntimeError("Distributed Training in Local GPU is not supported")
@@ -592,6 +590,15 @@ def __init__(
592590
else:
593591
self.sagemaker_session = sagemaker_session or Session()
594592

593+
self.tags = (
594+
add_jumpstart_uri_tags(
595+
tags=tags, training_model_uri=self.model_uri, training_script_uri=self.source_dir
596+
)
597+
if getattr(self.sagemaker_session, "settings", None) is not None
598+
and self.sagemaker_session.settings.include_jumpstart_tags
599+
else tags
600+
)
601+
595602
self.base_job_name = base_job_name
596603
self._current_job_name = None
597604
if (
@@ -3818,6 +3825,7 @@ def _distribution_configuration(self, distribution):
38183825

38193826
mpi_enabled = False
38203827
smdataparallel_enabled = False
3828+
p5_enabled = False
38213829
if "instance_groups" in distribution:
38223830
distribution_config["sagemaker_distribution_instance_groups"] = distribution[
38233831
"instance_groups"
@@ -3862,10 +3870,11 @@ def _distribution_configuration(self, distribution):
38623870
elif isinstance(self.instance_type, str):
38633871
p5_enabled = "p5.48xlarge" in self.instance_type
38643872
else:
3865-
raise ValueError(
3866-
"Invalid object type for instance_type argument. Expected "
3867-
f"{type(str)} or {type(ParameterString)} but got {type(self.instance_type)}."
3868-
)
3873+
for instance in self.instance_groups:
3874+
if "p5.48xlarge" in instance._to_request_dict().get("InstanceType", ()):
3875+
p5_enabled = True
3876+
break
3877+
38693878
img_uri = "" if self.image_uri is None else self.image_uri
38703879
for unsupported_image in Framework.UNSUPPORTED_DLC_IMAGE_FOR_SM_PARALLELISM:
38713880
if (

src/sagemaker/fw_utils.py

+3-1
Original file line numberDiff line numberDiff line change
@@ -138,6 +138,7 @@
138138
"1.12.1",
139139
"1.13.1",
140140
"2.0.0",
141+
"2.0.1",
141142
],
142143
}
143144

@@ -153,10 +154,11 @@
153154
"1.13.1",
154155
"2.0.0",
155156
"2.0.1",
157+
"2.1.0",
156158
]
157159

158160

159-
TORCH_DISTRIBUTED_GPU_SUPPORTED_FRAMEWORK_VERSIONS = ["1.13.1", "2.0.0", "2.0.1"]
161+
TORCH_DISTRIBUTED_GPU_SUPPORTED_FRAMEWORK_VERSIONS = ["1.13.1", "2.0.0", "2.0.1", "2.1.0"]
160162

161163
TRAINIUM_SUPPORTED_DISTRIBUTION_STRATEGIES = ["torch_distributed"]
162164
TRAINIUM_SUPPORTED_TORCH_DISTRIBUTED_FRAMEWORK_VERSIONS = [

src/sagemaker/image_uri_config/huggingface-neuronx.json

+86
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,49 @@
4545
"container_version": {"trn": "ubuntu20.04"},
4646
"sdk_versions": ["sdk2.9.1"]
4747
}
48+
},
49+
"4.34.1": {
50+
"version_aliases": {"pytorch1.13": "pytorch1.13.1"},
51+
"pytorch1.13.1": {
52+
"py_versions": ["py310"],
53+
"repository": "huggingface-pytorch-inference-neuronx",
54+
"registries": {
55+
"af-south-1": "626614931356",
56+
"il-central-1": "780543022126",
57+
"ap-east-1": "871362719292",
58+
"ap-northeast-1": "763104351884",
59+
"ap-northeast-2": "763104351884",
60+
"ap-northeast-3": "364406365360",
61+
"ap-south-1": "763104351884",
62+
"ap-south-2": "772153158452",
63+
"ap-southeast-1": "763104351884",
64+
"ap-southeast-2": "763104351884",
65+
"ap-southeast-4": "457447274322",
66+
"ca-central-1": "763104351884",
67+
"cn-north-1": "727897471807",
68+
"cn-northwest-1": "727897471807",
69+
"eu-central-1": "763104351884",
70+
"eu-central-2": "380420809688",
71+
"eu-north-1": "763104351884",
72+
"eu-west-1": "763104351884",
73+
"eu-west-2": "763104351884",
74+
"eu-west-3": "763104351884",
75+
"eu-south-1": "692866216735",
76+
"eu-south-2": "503227376785",
77+
"me-south-1": "217643126080",
78+
"sa-east-1": "763104351884",
79+
"us-east-1": "763104351884",
80+
"us-east-2": "763104351884",
81+
"us-gov-east-1": "446045086412",
82+
"us-gov-west-1": "442386744353",
83+
"us-iso-east-1": "886529160074",
84+
"us-isob-east-1": "094389454867",
85+
"us-west-1": "763104351884",
86+
"us-west-2": "763104351884"
87+
},
88+
"container_version": {"inf": "ubuntu20.04"},
89+
"sdk_versions": ["sdk2.15.0"]
90+
}
4891
}
4992
}
5093
},
@@ -94,6 +137,49 @@
94137
"container_version": {"inf": "ubuntu20.04"},
95138
"sdk_versions": ["sdk2.9.1"]
96139
}
140+
},
141+
"4.34.1": {
142+
"version_aliases": {"pytorch1.13": "pytorch1.13.1"},
143+
"pytorch1.13.1": {
144+
"py_versions": ["py310"],
145+
"repository": "huggingface-pytorch-inference-neuronx",
146+
"registries": {
147+
"af-south-1": "626614931356",
148+
"il-central-1": "780543022126",
149+
"ap-east-1": "871362719292",
150+
"ap-northeast-1": "763104351884",
151+
"ap-northeast-2": "763104351884",
152+
"ap-northeast-3": "364406365360",
153+
"ap-south-1": "763104351884",
154+
"ap-south-2": "772153158452",
155+
"ap-southeast-1": "763104351884",
156+
"ap-southeast-2": "763104351884",
157+
"ap-southeast-4": "457447274322",
158+
"ca-central-1": "763104351884",
159+
"cn-north-1": "727897471807",
160+
"cn-northwest-1": "727897471807",
161+
"eu-central-1": "763104351884",
162+
"eu-central-2": "380420809688",
163+
"eu-north-1": "763104351884",
164+
"eu-west-1": "763104351884",
165+
"eu-west-2": "763104351884",
166+
"eu-west-3": "763104351884",
167+
"eu-south-1": "692866216735",
168+
"eu-south-2": "503227376785",
169+
"me-south-1": "217643126080",
170+
"sa-east-1": "763104351884",
171+
"us-east-1": "763104351884",
172+
"us-east-2": "763104351884",
173+
"us-gov-east-1": "446045086412",
174+
"us-gov-west-1": "442386744353",
175+
"us-iso-east-1": "886529160074",
176+
"us-isob-east-1": "094389454867",
177+
"us-west-1": "763104351884",
178+
"us-west-2": "763104351884"
179+
},
180+
"container_version": {"inf": "ubuntu20.04"},
181+
"sdk_versions": ["sdk2.15.0"]
182+
}
97183
}
98184
}
99185
}

0 commit comments

Comments
 (0)