Skip to content

Commit c3d158c

Browse files
authored
Merge branch 'master-jumpstart' into feat/jumpstart-retrieve-functions
2 parents 8d95d30 + ac57772 commit c3d158c

File tree

68 files changed

+5700
-786
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

68 files changed

+5700
-786
lines changed

.gitignore

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,4 +28,5 @@ venv/
2828
.docker/
2929
env/
3030
.vscode/
31-
**/tmp
31+
**/tmp
32+
.python-version

CHANGELOG.md

Lines changed: 60 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,65 @@
11
# Changelog
22

3+
## v2.72.3 (2022-01-10)
4+
5+
### Features
6+
7+
* default repack encryption
8+
* support large pipeline
9+
* add support for pytorch 1.10.0
10+
11+
### Documentation Changes
12+
13+
* SageMaker model parallel library 1.6.0 API doc
14+
15+
### Bug Fixes and Other Changes
16+
17+
* Model Registration with BYO scripts
18+
* Add ContentType in test_auto_ml_describe
19+
* Re-deploy static integ test endpoint if it is not found
20+
* fix kmeans test deletion sequence, increment lineage statics
21+
* Increment static lineage pipeline
22+
* Fix lineage query integ tests
23+
* Add label_headers option for Clarify ModelExplainabilityMonitor
24+
* Add action type to lineage object
25+
* Collapse cross-account artifacts in query lineage response
26+
* Update CHANGELOG.md to remove defaulting dot characters
27+
28+
## v2.72.2 (2022-01-06)
29+
30+
### Bug Fixes and Other Changes
31+
32+
* Update CHANGELOG.md
33+
* Increment static lineage pipeline
34+
* fix kmeans test deletion sequence, increment lineage statics
35+
* Re-deploy static integ test endpoint if it is not found
36+
* Add ContentType in test_auto_ml_describe
37+
* Model Registration with BYO scripts
38+
39+
### Documentation Changes
40+
41+
* SageMaker model parallel library 1.6.0 API doc
42+
43+
## v2.72.1 (2021-12-20)
44+
45+
### Bug Fixes and Other Changes
46+
47+
* typos and broken link
48+
* S3Input - add support for instance attributes
49+
* Prevent repack_model script from referencing nonexistent directories
50+
* Set ProcessingStep upload locations deterministically to avoid cache
51+
52+
## v2.72.0 (2021-12-13)
53+
54+
### Features
55+
56+
* allow conditional parallel builds
57+
58+
### Bug Fixes and Other Changes
59+
60+
* local mode - support relative file structure
61+
* fix endpoint bug
62+
363
## v2.71.0 (2021-12-06)
464

565
### Features

VERSION

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
2.71.1.dev0
1+
2.72.4.dev0

ci-scripts/queue_build.py

Lines changed: 94 additions & 56 deletions
Original file line numberDiff line numberDiff line change
@@ -23,100 +23,138 @@
2323
).get_caller_identity()["Account"]
2424
bucket_name = "sagemaker-us-west-2-%s" % account
2525

26+
MAX_IN_PROGRESS_BUILDS = 3
27+
INTERVAL_BETWEEN_CONCURRENT_RUNS = 15 # minutes
28+
CLEAN_UP_TICKETS_OLDER_THAN = 8 # hours
29+
2630

2731
def queue_build():
28-
build_id = re.sub("[_/]", "-", os.environ.get("CODEBUILD_BUILD_ID", "CODEBUILD-BUILD-ID"))
29-
source_version = re.sub(
30-
"[_/]",
31-
"-",
32-
os.environ.get("CODEBUILD_SOURCE_VERSION", "CODEBUILD-SOURCE-VERSION"),
33-
)
3432
ticket_number = int(1000 * time.time())
35-
filename = "%s_%s_%s" % (ticket_number, build_id, source_version)
36-
37-
print("Created queue ticket %s" % ticket_number)
38-
39-
_write_ticket(filename)
4033
files = _list_tickets()
41-
_cleanup_tickets_older_than_8_hours(files)
42-
_wait_for_other_builds(files, ticket_number)
34+
_cleanup_tickets_older_than(files)
35+
_wait_for_other_builds(ticket_number)
4336

4437

4538
def _build_info_from_file(file):
46-
filename = file.key.split("/")[1]
39+
filename = file.key.split("/")[2]
4740
ticket_number, build_id, source_version = filename.split("_")
4841
return int(ticket_number), build_id, source_version
4942

5043

51-
def _wait_for_other_builds(files, ticket_number):
52-
newfiles = list(filter(lambda file: not _file_older_than(file), files))
53-
sorted_files = list(sorted(newfiles, key=lambda y: y.key))
44+
def _wait_for_other_builds(ticket_number):
45+
sorted_files = _list_tickets()
5446

5547
print("build queue status:")
5648
print()
5749

5850
for order, file in enumerate(sorted_files):
5951
file_ticket_number, build_id, source_version = _build_info_from_file(file)
6052
print(
61-
"%s -> %s %s, ticket number: %s" % (order, build_id, source_version, file_ticket_number)
53+
"%s -> %s %s, ticket number: %s status: %s"
54+
% (order, build_id, source_version, file_ticket_number, file.key.split("/")[1])
6255
)
56+
print()
57+
build_id = re.sub("[_/]", "-", os.environ.get("CODEBUILD_BUILD_ID", "CODEBUILD-BUILD-ID"))
58+
source_version = re.sub(
59+
"[_/]",
60+
"-",
61+
os.environ.get("CODEBUILD_SOURCE_VERSION", "CODEBUILD-SOURCE-VERSION"),
62+
)
63+
filename = "%s_%s_%s" % (ticket_number, build_id, source_version)
64+
s3_file_obj = _write_ticket(filename, status="waiting")
65+
print("Build %s waiting to be scheduled" % filename)
66+
67+
while True:
68+
_cleanup_tickets_with_terminal_states()
69+
waiting_tickets = _list_tickets("waiting")
70+
if waiting_tickets:
71+
first_waiting_ticket_number, _, _ = _build_info_from_file(_list_tickets("waiting")[0])
72+
else:
73+
first_waiting_ticket_number = ticket_number
74+
75+
if (
76+
len(_list_tickets(status="in-progress")) < 3
77+
and last_in_progress_elapsed_time_check()
78+
and first_waiting_ticket_number == ticket_number
79+
):
80+
# put the build in progress
81+
print("Scheduling build %s for running.." % filename)
82+
s3_file_obj.delete()
83+
_write_ticket(filename, status="in-progress")
84+
break
85+
else:
86+
# wait
87+
time.sleep(30)
6388

64-
for file in sorted_files:
65-
file_ticket_number, build_id, source_version = _build_info_from_file(file)
6689

67-
if file_ticket_number == ticket_number:
90+
def last_in_progress_elapsed_time_check():
91+
in_progress_tickets = _list_tickets("in-progress")
92+
if not in_progress_tickets:
93+
return True
94+
last_in_progress_ticket, _, _ = _build_info_from_file(_list_tickets("in-progress")[-1])
95+
_elapsed_time = int(1000 * time.time()) - last_in_progress_ticket
96+
last_in_progress_elapsed_time = int(_elapsed_time / (1000 * 60)) # in minutes
97+
return last_in_progress_elapsed_time > INTERVAL_BETWEEN_CONCURRENT_RUNS
6898

69-
break
70-
else:
71-
while True:
72-
client = boto3.client("codebuild")
73-
response = client.batch_get_builds(ids=[build_id])
74-
build_status = response["builds"][0]["buildStatus"]
75-
76-
if build_status == "IN_PROGRESS":
77-
print(
78-
"waiting on build %s %s %s" % (build_id, source_version, file_ticket_number)
79-
)
80-
time.sleep(30)
81-
else:
82-
print("build %s finished, deleting lock" % build_id)
83-
file.delete()
84-
break
85-
86-
87-
def _cleanup_tickets_older_than_8_hours(files):
99+
100+
def _cleanup_tickets_with_terminal_states():
101+
files = _list_tickets()
102+
build_ids = []
103+
for file in files:
104+
_, build_id, _ = _build_info_from_file(file)
105+
build_ids.append(build_id)
106+
107+
client = boto3.client("codebuild")
108+
response = client.batch_get_builds(ids=build_ids)
109+
110+
for file, build_details in zip(files, response["builds"]):
111+
_, _build_id_from_file, _ = _build_info_from_file(file)
112+
build_status = build_details["buildStatus"]
113+
114+
if build_status != "IN_PROGRESS" and _build_id_from_file == build_details["id"]:
115+
print(
116+
"Build %s in terminal state: %s, deleting lock"
117+
% (_build_id_from_file, build_status)
118+
)
119+
file.delete()
120+
121+
122+
def _cleanup_tickets_older_than(files):
88123
oldfiles = list(filter(_file_older_than, files))
89124
for file in oldfiles:
90125
print("object %s older than 8 hours. Deleting" % file.key)
91126
file.delete()
92127
return files
93128

94129

95-
def _list_tickets():
130+
def _list_tickets(status=None):
96131
s3 = boto3.resource("s3")
97132
bucket = s3.Bucket(bucket_name)
98-
objects = [file for file in bucket.objects.filter(Prefix="ci-lock/")]
99-
files = list(filter(lambda x: x != "ci-lock/", objects))
100-
return files
133+
prefix = "ci-integ-queue/{}/".format(status) if status else "ci-integ-queue/"
134+
objects = [file for file in bucket.objects.filter(Prefix=prefix)]
135+
files = list(filter(lambda x: x != prefix, objects))
136+
sorted_files = list(sorted(files, key=lambda y: y.key))
137+
return sorted_files
101138

102139

103140
def _file_older_than(file):
104-
timelimit = 1000 * 60 * 60 * 8
105-
141+
timelimit = 1000 * 60 * 60 * CLEAN_UP_TICKETS_OLDER_THAN
106142
file_ticket_number, build_id, source_version = _build_info_from_file(file)
143+
return int(1000 * time.time()) - file_ticket_number > timelimit
107144

108-
return int(time.time()) - file_ticket_number > timelimit
109-
110-
111-
def _write_ticket(ticket_number):
112145

113-
if not os.path.exists("ci-lock"):
114-
os.mkdir("ci-lock")
146+
def _write_ticket(filename, status="waiting"):
147+
file_path = "ci-integ-queue/{}".format(status)
148+
if not os.path.exists(file_path):
149+
os.makedirs(file_path)
115150

116-
filename = "ci-lock/" + ticket_number
117-
with open(filename, "w") as file:
118-
file.write(ticket_number)
119-
boto3.Session().resource("s3").Object(bucket_name, filename).upload_file(filename)
151+
file_full_path = file_path + "/" + filename
152+
with open(file_full_path, "w") as file:
153+
file.write(filename)
154+
s3_file_obj = boto3.Session().resource("s3").Object(bucket_name, file_full_path)
155+
s3_file_obj.upload_file(file_full_path)
156+
print("Build %s is now in state %s" % (filename, status))
157+
return s3_file_obj
120158

121159

122160
if __name__ == "__main__":

doc/api/training/smd_data_parallel.rst

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
1-
##########################
2-
Distributed data parallel
3-
##########################
1+
###############################################
2+
The SageMaker Distributed Data Parallel Library
3+
###############################################
44

55
SageMaker's distributed data parallel library extends SageMaker’s training
66
capabilities on deep learning models with near-linear scaling efficiency,

doc/api/training/smd_model_parallel.rst

Lines changed: 25 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
1-
Distributed model parallel
2-
--------------------------
1+
The SageMaker Distributed Model Parallel Library
2+
------------------------------------------------
33

44
The Amazon SageMaker distributed model parallel library is a model parallelism library for training
55
large deep learning models that were previously difficult to train due to GPU memory limitations.
@@ -9,49 +9,35 @@ allowing you to increase prediction accuracy by creating larger models with more
99
You can use the library to automatically partition your existing TensorFlow and PyTorch workloads
1010
across multiple GPUs with minimal code changes. The library's API can be accessed through the Amazon SageMaker SDK.
1111

12-
Use the following sections to learn more about the model parallelism and the library.
13-
14-
Use with the SageMaker Python SDK
15-
=================================
16-
17-
Use the following page to learn how to configure and enable distributed model parallel
18-
when you configure an Amazon SageMaker Python SDK `Estimator`.
12+
See the following sections to learn more about the SageMaker model parallel library APIs.
1913

2014
.. toctree::
21-
:maxdepth: 1
15+
:maxdepth: 3
2216

17+
smp_versions/latest
2318
smd_model_parallel_general
2419

25-
API Documentation
26-
=================
27-
28-
The library contains a Common API that is shared across frameworks, as well as APIs
29-
that are specific to supported frameworks, TensorFlow and PyTorch.
30-
31-
Select a version to see the API documentation for version. To use the library, reference the
32-
**Common API** documentation alongside the framework specific API documentation.
33-
34-
.. toctree::
35-
:maxdepth: 1
36-
37-
smp_versions/latest.rst
38-
smp_versions/v1_3_0.rst
39-
smp_versions/v1_2_0.rst
40-
smp_versions/v1_1_0.rst
41-
42-
It is recommended to use this documentation alongside `SageMaker Distributed Model Parallel
43-
<http://docs.aws.amazon.com/sagemaker/latest/dg/model-parallel.html>`__ in the Amazon SageMaker
44-
developer guide. This developer guide documentation includes:
4520

46-
- An overview of model parallelism and the library
47-
`core features <https://docs.aws.amazon.com/sagemaker/latest/dg/model-parallel-core-features.html>`__
48-
- Instructions on how to modify `TensorFlow
49-
<https://docs.aws.amazon.com/sagemaker/latest/dg/model-parallel-customize-training-script.html#model-parallel-customize-training-script-tf>`__
50-
and `PyTorch
51-
<https://docs.aws.amazon.com/sagemaker/latest/dg/model-parallel-customize-training-script.html#model-parallel-customize-training-script-pt>`__
52-
training scripts
53-
- `Configuration tips and pitfalls
54-
<https://docs.aws.amazon.com/sagemaker/latest/dg/model-parallel-customize-tips-pitfalls.html>`__
21+
.. tip::
22+
23+
We recommend using this API documentation with the conceptual guide at
24+
`SageMaker's Distributed Model Parallel
25+
<http://docs.aws.amazon.com/sagemaker/latest/dg/model-parallel.html>`_
26+
in the *Amazon SageMaker developer guide*. This developer guide documentation includes:
27+
28+
- An overview of model parallelism, and the library's
29+
`core features <https://docs.aws.amazon.com/sagemaker/latest/dg/model-parallel-core-features.html>`_,
30+
and `extended features for PyTorch <https://docs.aws.amazon.com/sagemaker/latest/dg/model-parallel-extended-features-pytorch.html>`_.
31+
- Instructions on how to modify `TensorFlow
32+
<https://docs.aws.amazon.com/sagemaker/latest/dg/model-parallel-customize-training-script-tf.html>`_
33+
and `PyTorch
34+
<https://docs.aws.amazon.com/sagemaker/latest/dg/model-parallel-customize-training-script-pt.html>`_
35+
training scripts.
36+
- Instructions on how to `run a distributed training job using the SageMaker Python SDK
37+
and the SageMaker model parallel library
38+
<https://docs.aws.amazon.com/sagemaker/latest/dg/model-parallel-sm-sdk.html>`_.
39+
- `Configuration tips and pitfalls
40+
<https://docs.aws.amazon.com/sagemaker/latest/dg/model-parallel-customize-tips-pitfalls.html>`_.
5541

5642

5743
.. important::

0 commit comments

Comments
 (0)