Commit 4c1a1e1

Merge branch 'master' of https://github.com/aws/sagemaker-python-sdk into smdmp-1.6.0-doc
2 parents 35534b5 + a3b588c commit 4c1a1e1

File tree

15 files changed: +582 -206 lines changed

CHANGELOG.md

+11

@@ -1,5 +1,16 @@
 # Changelog
 
+## v2.72.0 (2021-12-13)
+
+### Features
+
+* allow conditional parellel builds
+
+### Bug Fixes and Other Changes
+
+* local mode - support relative file structure
+* fix endpoint bug
+
 ## v2.71.0 (2021-12-06)
 
 ### Features

VERSION

+1 -1

@@ -1 +1 @@
-2.71.1.dev0
+2.72.1.dev0

ci-scripts/queue_build.py

+94 -56

@@ -23,100 +23,138 @@
 ).get_caller_identity()["Account"]
 bucket_name = "sagemaker-us-west-2-%s" % account
 
+MAX_IN_PROGRESS_BUILDS = 3
+INTERVAL_BETWEEN_CONCURRENT_RUNS = 15  # minutes
+CLEAN_UP_TICKETS_OLDER_THAN = 8  # hours
+
 
 def queue_build():
-    build_id = re.sub("[_/]", "-", os.environ.get("CODEBUILD_BUILD_ID", "CODEBUILD-BUILD-ID"))
-    source_version = re.sub(
-        "[_/]",
-        "-",
-        os.environ.get("CODEBUILD_SOURCE_VERSION", "CODEBUILD-SOURCE-VERSION"),
-    )
     ticket_number = int(1000 * time.time())
-    filename = "%s_%s_%s" % (ticket_number, build_id, source_version)
-
-    print("Created queue ticket %s" % ticket_number)
-
-    _write_ticket(filename)
     files = _list_tickets()
-    _cleanup_tickets_older_than_8_hours(files)
-    _wait_for_other_builds(files, ticket_number)
+    _cleanup_tickets_older_than(files)
+    _wait_for_other_builds(ticket_number)
 
 
 def _build_info_from_file(file):
-    filename = file.key.split("/")[1]
+    filename = file.key.split("/")[2]
     ticket_number, build_id, source_version = filename.split("_")
     return int(ticket_number), build_id, source_version
 
 
-def _wait_for_other_builds(files, ticket_number):
-    newfiles = list(filter(lambda file: not _file_older_than(file), files))
-    sorted_files = list(sorted(newfiles, key=lambda y: y.key))
+def _wait_for_other_builds(ticket_number):
+    sorted_files = _list_tickets()
 
     print("build queue status:")
     print()
 
     for order, file in enumerate(sorted_files):
         file_ticket_number, build_id, source_version = _build_info_from_file(file)
         print(
-            "%s -> %s %s, ticket number: %s" % (order, build_id, source_version, file_ticket_number)
+            "%s -> %s %s, ticket number: %s status: %s"
+            % (order, build_id, source_version, file_ticket_number, file.key.split("/")[1])
         )
+    print()
+    build_id = re.sub("[_/]", "-", os.environ.get("CODEBUILD_BUILD_ID", "CODEBUILD-BUILD-ID"))
+    source_version = re.sub(
+        "[_/]",
+        "-",
+        os.environ.get("CODEBUILD_SOURCE_VERSION", "CODEBUILD-SOURCE-VERSION"),
+    )
+    filename = "%s_%s_%s" % (ticket_number, build_id, source_version)
+    s3_file_obj = _write_ticket(filename, status="waiting")
+    print("Build %s waiting to be scheduled" % filename)
+
+    while True:
+        _cleanup_tickets_with_terminal_states()
+        waiting_tickets = _list_tickets("waiting")
+        if waiting_tickets:
+            first_waiting_ticket_number, _, _ = _build_info_from_file(_list_tickets("waiting")[0])
+        else:
+            first_waiting_ticket_number = ticket_number
+
+        if (
+            len(_list_tickets(status="in-progress")) < 3
+            and last_in_progress_elapsed_time_check()
+            and first_waiting_ticket_number == ticket_number
+        ):
+            # put the build in progress
+            print("Scheduling build %s for running.." % filename)
+            s3_file_obj.delete()
+            _write_ticket(filename, status="in-progress")
+            break
+        else:
+            # wait
+            time.sleep(30)
 
-    for file in sorted_files:
-        file_ticket_number, build_id, source_version = _build_info_from_file(file)
 
-        if file_ticket_number == ticket_number:
+def last_in_progress_elapsed_time_check():
+    in_progress_tickets = _list_tickets("in-progress")
+    if not in_progress_tickets:
+        return True
+    last_in_progress_ticket, _, _ = _build_info_from_file(_list_tickets("in-progress")[-1])
+    _elapsed_time = int(1000 * time.time()) - last_in_progress_ticket
+    last_in_progress_elapsed_time = int(_elapsed_time / (1000 * 60))  # in minutes
+    return last_in_progress_elapsed_time > INTERVAL_BETWEEN_CONCURRENT_RUNS
 
-            break
-        else:
-            while True:
-                client = boto3.client("codebuild")
-                response = client.batch_get_builds(ids=[build_id])
-                build_status = response["builds"][0]["buildStatus"]
-
-                if build_status == "IN_PROGRESS":
-                    print(
-                        "waiting on build %s %s %s" % (build_id, source_version, file_ticket_number)
-                    )
-                    time.sleep(30)
-                else:
-                    print("build %s finished, deleting lock" % build_id)
-                    file.delete()
-                    break
-
-
-def _cleanup_tickets_older_than_8_hours(files):
+
+def _cleanup_tickets_with_terminal_states():
+    files = _list_tickets()
+    build_ids = []
+    for file in files:
+        _, build_id, _ = _build_info_from_file(file)
+        build_ids.append(build_id)
+
+    client = boto3.client("codebuild")
+    response = client.batch_get_builds(ids=build_ids)
+
+    for file, build_details in zip(files, response["builds"]):
+        _, _build_id_from_file, _ = _build_info_from_file(file)
+        build_status = build_details["buildStatus"]
+
+        if build_status != "IN_PROGRESS" and _build_id_from_file == build_details["id"]:
+            print(
+                "Build %s in terminal state: %s, deleting lock"
+                % (_build_id_from_file, build_status)
+            )
+            file.delete()
+
+
+def _cleanup_tickets_older_than(files):
     oldfiles = list(filter(_file_older_than, files))
     for file in oldfiles:
         print("object %s older than 8 hours. Deleting" % file.key)
         file.delete()
     return files
 
 
-def _list_tickets():
+def _list_tickets(status=None):
     s3 = boto3.resource("s3")
     bucket = s3.Bucket(bucket_name)
-    objects = [file for file in bucket.objects.filter(Prefix="ci-lock/")]
-    files = list(filter(lambda x: x != "ci-lock/", objects))
-    return files
+    prefix = "ci-integ-queue/{}/".format(status) if status else "ci-integ-queue/"
+    objects = [file for file in bucket.objects.filter(Prefix=prefix)]
+    files = list(filter(lambda x: x != prefix, objects))
+    sorted_files = list(sorted(files, key=lambda y: y.key))
+    return sorted_files
 
 
 def _file_older_than(file):
-    timelimit = 1000 * 60 * 60 * 8
-
+    timelimit = 1000 * 60 * 60 * CLEAN_UP_TICKETS_OLDER_THAN
     file_ticket_number, build_id, source_version = _build_info_from_file(file)
+    return int(1000 * time.time()) - file_ticket_number > timelimit
 
-    return int(time.time()) - file_ticket_number > timelimit
-
-
-def _write_ticket(ticket_number):
 
-    if not os.path.exists("ci-lock"):
-        os.mkdir("ci-lock")
+def _write_ticket(filename, status="waiting"):
+    file_path = "ci-integ-queue/{}".format(status)
+    if not os.path.exists(file_path):
+        os.makedirs(file_path)
 
-    filename = "ci-lock/" + ticket_number
-    with open(filename, "w") as file:
-        file.write(ticket_number)
-    boto3.Session().resource("s3").Object(bucket_name, filename).upload_file(filename)
+    file_full_path = file_path + "/" + filename
+    with open(file_full_path, "w") as file:
+        file.write(filename)
+    s3_file_obj = boto3.Session().resource("s3").Object(bucket_name, file_full_path)
+    s3_file_obj.upload_file(file_full_path)
+    print("Build %s is now in state %s" % (filename, status))
+    return s3_file_obj
 
 
 if __name__ == "__main__":
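
For orientation, the rewritten script keys everything off S3 object keys of the form ci-integ-queue/<status>/<ticket>_<build-id>_<source-version>, which is why _build_info_from_file now reads the third path segment and the queue printout pulls the status from the second. A minimal sketch of that naming scheme, using a made-up key and no AWS calls (the concrete key below is hypothetical):

import time

# Hypothetical ticket key following the layout used in the diff above:
# ci-integ-queue/<status>/<ticket-number>_<build-id>_<source-version>
key = "ci-integ-queue/waiting/1639400000000_codebuild-build-id_codebuild-source-version"

_, status, ticket_filename = key.split("/")
ticket_number, build_id, source_version = ticket_filename.split("_")

# Ticket numbers are epoch milliseconds, so age checks are plain integer math,
# mirroring _file_older_than with CLEAN_UP_TICKETS_OLDER_THAN = 8 hours.
age_ms = int(1000 * time.time()) - int(ticket_number)
expired = age_ms > 1000 * 60 * 60 * 8

print(status, build_id, source_version, expired)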

doc/frameworks/pytorch/using_pytorch.rst

+3 -3

@@ -80,7 +80,7 @@ with the following:
 
     # ... load from args.train and args.test, train a model, write model to args.model_dir.
 
-Because the SageMaker imports your training script, you should put your training code in a main guard
+Because SageMaker imports your training script, you should put your training code in a main guard
 (``if __name__=='__main__':``) if you are using the same script to host your model, so that SageMaker does not
 inadvertently run your training code at the wrong point in execution.
 
@@ -177,7 +177,7 @@ fit Required Arguments
 case, the S3 objects rooted at the ``my-training-data`` prefix will
 be available in the default ``train`` channel. A dict from
 string channel names to S3 URIs. In this case, the objects rooted at
-each S3 prefix will available as files in each channel directory.
+each S3 prefix will be available as files in each channel directory.
 
 For example:
 
@@ -391,7 +391,7 @@ If you are using PyTorch Elastic Inference 1.5.1, you should provide ``model_fn`
 The client-side Elastic Inference framework is CPU-only, even though inference still happens in a CUDA context on the server. Thus, the default ``model_fn`` for Elastic Inference loads the model to CPU. Tracing models may lead to tensor creation on a specific device, which may cause device-related errors when loading a model onto a different device. Providing an explicit ``map_location=torch.device('cpu')`` argument forces all tensors to CPU.
 
 For more information on the default inference handler functions, please refer to:
-`SageMaker PyTorch Default Inference Handler <https://github.com/aws/sagemaker-pytorch-serving-container/blob/master/src/sagemaker_pytorch_serving_container/default_inference_handler.py>`_.
+`SageMaker PyTorch Default Inference Handler <https://github.com/aws/sagemaker-pytorch-inference-toolkit/blob/master/src/sagemaker_pytorch_serving_container/default_pytorch_inference_handler.py>`_.
 
 Serve a PyTorch Model
 ---------------------