Skip to content

Commit d0a7a4b

Browse files
authored
Merge branch 'dev' into removeDuplicate
2 parents c185c8d + 1ff8ae3 commit d0a7a4b

File tree

16 files changed

+703
-233
lines changed

16 files changed

+703
-233
lines changed

CHANGELOG.md

+14
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,19 @@
11
# Changelog
22

3+
## v2.71.0 (2021-12-06)
4+
5+
### Features
6+
7+
* Add support for TF 2.6
8+
* Adding PT 17/18 Repo
9+
* Add profile_name support for Feature Store ingestion
10+
11+
### Bug Fixes and Other Changes
12+
13+
* Fix non-existent variable name
14+
* Add TF 2.6.2 on training
15+
* Recreate static lineage test data
16+
317
## v2.70.0 (2021-12-02)
418

519
### Features

VERSION

+1-1
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
2.70.1.dev0
1+
2.71.1.dev0

ci-scripts/queue_build.py

+94-56
Original file line numberDiff line numberDiff line change
@@ -23,100 +23,138 @@
2323
).get_caller_identity()["Account"]
2424
bucket_name = "sagemaker-us-west-2-%s" % account
2525

26+
MAX_IN_PROGRESS_BUILDS = 3
27+
INTERVAL_BETWEEN_CONCURRENT_RUNS = 15 # minutes
28+
CLEAN_UP_TICKETS_OLDER_THAN = 8 # hours
29+
2630

2731
def queue_build():
    """Take a queue ticket and block until this build is allowed to run.

    The ticket number is the current time in milliseconds since the epoch.
    Stale tickets are purged from the queue first; the function returns once
    ``_wait_for_other_builds`` returns for this ticket.
    """
    ticket_number = int(1000 * time.time())
    _cleanup_tickets_older_than(_list_tickets())
    _wait_for_other_builds(ticket_number)
4336

4437

4538
def _build_info_from_file(file):
    """Split a ticket object's key into its three components.

    Args:
        file: object exposing a ``key`` attribute whose last path component
            has the form ``<ticket number>_<build id>_<source version>``.

    Returns:
        tuple: ``(ticket_number, build_id, source_version)``, with the ticket
        number converted to ``int``.

    Raises:
        ValueError: if the file name does not consist of exactly three
            ``_``-separated fields, or the ticket number is not an integer.
    """
    # Take the last path component instead of a fixed index so parsing does
    # not depend on how many "/" levels the queue prefix has.
    filename = file.key.rsplit("/", 1)[-1]
    ticket_number, build_id, source_version = filename.split("_")
    return int(ticket_number), build_id, source_version
4942

5043

51-
def _wait_for_other_builds(files, ticket_number):
52-
newfiles = list(filter(lambda file: not _file_older_than(file), files))
53-
sorted_files = list(sorted(newfiles, key=lambda y: y.key))
44+
def _wait_for_other_builds(ticket_number):
    """Enqueue this build as "waiting" and block until it may start.

    Prints the current queue, writes a waiting ticket for this build, then
    polls every 30 seconds. The build is moved to "in-progress" once all of
    the following hold:

    * this ticket is first in the waiting queue,
    * fewer than ``MAX_IN_PROGRESS_BUILDS`` tickets are in progress, and
    * ``last_in_progress_elapsed_time_check()`` returns True.

    Args:
        ticket_number (int): ticket number identifying this build.
    """
    sorted_files = _list_tickets()

    print("build queue status:")
    print()

    for order, file in enumerate(sorted_files):
        file_ticket_number, build_id, source_version = _build_info_from_file(file)
        print(
            "%s -> %s %s, ticket number: %s status: %s"
            % (order, build_id, source_version, file_ticket_number, file.key.split("/")[1])
        )
    print()

    # "_" and "/" are replaced because "_" separates the ticket file name
    # fields and "/" separates key path components.
    build_id = re.sub("[_/]", "-", os.environ.get("CODEBUILD_BUILD_ID", "CODEBUILD-BUILD-ID"))
    source_version = re.sub(
        "[_/]",
        "-",
        os.environ.get("CODEBUILD_SOURCE_VERSION", "CODEBUILD-SOURCE-VERSION"),
    )
    filename = "%s_%s_%s" % (ticket_number, build_id, source_version)
    s3_file_obj = _write_ticket(filename, status="waiting")
    print("Build %s waiting to be scheduled" % filename)

    while True:
        _cleanup_tickets_with_terminal_states()
        waiting_tickets = _list_tickets("waiting")
        if waiting_tickets:
            # Reuse the listing just fetched: listing again could come back
            # empty if the queue changed in between, and [0] would then fail.
            first_waiting_ticket_number, _, _ = _build_info_from_file(waiting_tickets[0])
        else:
            first_waiting_ticket_number = ticket_number

        if (
            len(_list_tickets(status="in-progress")) < MAX_IN_PROGRESS_BUILDS
            and last_in_progress_elapsed_time_check()
            and first_waiting_ticket_number == ticket_number
        ):
            # put the build in progress
            print("Scheduling build %s for running.." % filename)
            s3_file_obj.delete()
            _write_ticket(filename, status="in-progress")
            break

        # wait
        time.sleep(30)
6388

64-
for file in sorted_files:
65-
file_ticket_number, build_id, source_version = _build_info_from_file(file)
6689

67-
if file_ticket_number == ticket_number:
90+
def last_in_progress_elapsed_time_check():
    """Tell whether enough time has passed since the last in-progress ticket.

    Returns:
        bool: True when there is no in-progress ticket, or when the last
        in-progress ticket (in listing order) is more than
        ``INTERVAL_BETWEEN_CONCURRENT_RUNS`` whole minutes old.
    """
    in_progress_tickets = _list_tickets("in-progress")
    if not in_progress_tickets:
        return True

    # Reuse the listing checked above: a second listing could come back empty
    # if the ticket was deleted meanwhile, and [-1] would raise IndexError.
    last_in_progress_ticket, _, _ = _build_info_from_file(in_progress_tickets[-1])

    # Ticket numbers are millisecond timestamps, so the difference is in ms.
    elapsed_ms = int(1000 * time.time()) - last_in_progress_ticket
    elapsed_minutes = int(elapsed_ms / (1000 * 60))
    return elapsed_minutes > INTERVAL_BETWEEN_CONCURRENT_RUNS
6898

69-
break
70-
else:
71-
while True:
72-
client = boto3.client("codebuild")
73-
response = client.batch_get_builds(ids=[build_id])
74-
build_status = response["builds"][0]["buildStatus"]
75-
76-
if build_status == "IN_PROGRESS":
77-
print(
78-
"waiting on build %s %s %s" % (build_id, source_version, file_ticket_number)
79-
)
80-
time.sleep(30)
81-
else:
82-
print("build %s finished, deleting lock" % build_id)
83-
file.delete()
84-
break
85-
86-
87-
def _cleanup_tickets_older_than_8_hours(files):
99+
100+
def _cleanup_tickets_with_terminal_states():
    """Delete the tickets of builds that CodeBuild no longer reports as running.

    Every ticket in the queue is looked up in CodeBuild by the build id stored
    in its file name. A ticket is deleted when its build is returned with a
    status other than ``IN_PROGRESS``. Tickets whose build id is not returned
    at all are left untouched.

    NOTE(review): build ids stored in ticket names have "_" and "/" replaced
    by "-", so an id that originally contained either character will not
    match any CodeBuild build -- confirm whether that can occur.
    """
    files = _list_tickets()
    if not files:
        # Nothing to check; also avoids calling the API with an empty id list.
        return

    build_ids = [_build_info_from_file(file)[1] for file in files]
    unique_ids = sorted(set(build_ids))

    client = boto3.client("codebuild")
    statuses = {}
    # batch_get_builds accepts a limited number of ids per call (100).
    batch_size = 100
    for start in range(0, len(unique_ids), batch_size):
        response = client.batch_get_builds(ids=unique_ids[start : start + batch_size])
        for build_details in response["builds"]:
            statuses[build_details["id"]] = build_details["buildStatus"]

    # Match tickets to builds by id rather than by position: the response is
    # not guaranteed to contain one entry per requested id, so pairing by
    # index could misalign and skip terminal builds.
    for file, build_id in zip(files, build_ids):
        build_status = statuses.get(build_id)
        if build_status is not None and build_status != "IN_PROGRESS":
            print("Build %s in terminal state: %s, deleting lock" % (build_id, build_status))
            file.delete()
120+
121+
122+
def _cleanup_tickets_older_than(files):
    """Delete every ticket that ``_file_older_than`` reports as stale.

    Args:
        files: iterable of ticket objects exposing ``key`` and ``delete()``.

    Returns:
        The ``files`` argument itself; deleted tickets are not removed from it.
    """
    oldfiles = [file for file in files if _file_older_than(file)]
    for file in oldfiles:
        # Report the configured threshold instead of a hard-coded "8".
        print(
            "object %s older than %s hours. Deleting" % (file.key, CLEAN_UP_TICKETS_OLDER_THAN)
        )
        file.delete()
    return files
93128

94129

95-
def _list_tickets():
130+
def _list_tickets(status=None):
    """List the queue tickets stored in the S3 bucket, sorted by key.

    Args:
        status (str): optional queue state used as a key sub-prefix (the
            callers in this file pass "waiting" or "in-progress"). When
            omitted, tickets in every state are returned.

    Returns:
        list: S3 object summaries under the queue prefix, ordered by key.
    """
    s3 = boto3.resource("s3")
    bucket = s3.Bucket(bucket_name)
    prefix = "ci-integ-queue/{}/".format(status) if status else "ci-integ-queue/"

    # Filter on the object's key: comparing the object summary itself to the
    # prefix string is always unequal, so nothing was ever excluded. Keys
    # ending in "/" are folder placeholders, not tickets.
    tickets = [obj for obj in bucket.objects.filter(Prefix=prefix) if not obj.key.endswith("/")]
    return sorted(tickets, key=lambda obj: obj.key)
101138

102139

103140
def _file_older_than(file):
    """Tell whether a ticket is older than ``CLEAN_UP_TICKETS_OLDER_THAN`` hours.

    The ticket number parsed from the file name is treated as the creation
    time in milliseconds since the epoch.
    """
    max_age_ms = 1000 * 60 * 60 * CLEAN_UP_TICKETS_OLDER_THAN
    created_ms = _build_info_from_file(file)[0]
    age_ms = int(1000 * time.time()) - created_ms
    return age_ms > max_age_ms
107144

108-
return int(time.time()) - file_ticket_number > timelimit
109-
110-
111-
def _write_ticket(ticket_number):
112145

113-
if not os.path.exists("ci-lock"):
114-
os.mkdir("ci-lock")
146+
def _write_ticket(filename, status="waiting"):
    """Create a ticket file locally and upload it to the queue in S3.

    The ticket is written to ``ci-integ-queue/<status>/<filename>`` on the
    local filesystem and uploaded to the bucket under the same key.

    Args:
        filename (str): ticket file name; also written as the file content.
        status (str): queue state used as the sub-directory and key prefix.

    Returns:
        The S3 object for the uploaded ticket.
    """
    file_path = "ci-integ-queue/{}".format(status)
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(file_path, exist_ok=True)

    # Joined with "/" on purpose: the same string is used as the S3 key.
    file_full_path = file_path + "/" + filename
    with open(file_full_path, "w") as file:
        file.write(filename)

    s3_file_obj = boto3.Session().resource("s3").Object(bucket_name, file_full_path)
    s3_file_obj.upload_file(file_full_path)
    print("Build %s is now in state %s" % (filename, status))
    return s3_file_obj
120158

121159

122160
if __name__ == "__main__":

src/sagemaker/fw_utils.py

+14-1
Original file line numberDiff line numberDiff line change
@@ -59,7 +59,20 @@
5959
"local_gpu",
6060
)
6161
SM_DATAPARALLEL_SUPPORTED_FRAMEWORK_VERSIONS = {
62-
"tensorflow": ["2.3", "2.3.1", "2.3.2", "2.4", "2.4.1", "2.4.3", "2.5", "2.5.0", "2.5.1"],
62+
"tensorflow": [
63+
"2.3",
64+
"2.3.1",
65+
"2.3.2",
66+
"2.4",
67+
"2.4.1",
68+
"2.4.3",
69+
"2.5",
70+
"2.5.0",
71+
"2.5.1",
72+
"2.6",
73+
"2.6.0",
74+
"2.6.2",
75+
],
6376
"pytorch": ["1.6", "1.6.0", "1.7", "1.7.1", "1.8", "1.8.0", "1.8.1", "1.9", "1.9.0", "1.9.1"],
6477
}
6578
SMDISTRIBUTED_SUPPORTED_STRATEGIES = ["dataparallel", "modelparallel"]

src/sagemaker/image_uri_config/tensorflow.json

+100-2
Original file line numberDiff line numberDiff line change
@@ -278,7 +278,8 @@
278278
"2.2": "2.2.2",
279279
"2.3": "2.3.2",
280280
"2.4": "2.4.3",
281-
"2.5": "2.5.1"
281+
"2.5": "2.5.1",
282+
"2.6": "2.6.0"
282283
},
283284
"versions": {
284285
"1.10.0": {
@@ -1312,6 +1313,36 @@
13121313
"us-west-2": "763104351884"
13131314
},
13141315
"repository": "tensorflow-inference"
1316+
},
1317+
"2.6.0": {
1318+
"registries": {
1319+
"af-south-1": "626614931356",
1320+
"ap-east-1": "871362719292",
1321+
"ap-northeast-1": "763104351884",
1322+
"ap-northeast-2": "763104351884",
1323+
"ap-northeast-3": "364406365360",
1324+
"ap-south-1": "763104351884",
1325+
"ap-southeast-1": "763104351884",
1326+
"ap-southeast-2": "763104351884",
1327+
"ca-central-1": "763104351884",
1328+
"cn-north-1": "727897471807",
1329+
"cn-northwest-1": "727897471807",
1330+
"eu-central-1": "763104351884",
1331+
"eu-north-1": "763104351884",
1332+
"eu-south-1": "692866216735",
1333+
"eu-west-1": "763104351884",
1334+
"eu-west-2": "763104351884",
1335+
"eu-west-3": "763104351884",
1336+
"me-south-1": "217643126080",
1337+
"sa-east-1": "763104351884",
1338+
"us-east-1": "763104351884",
1339+
"us-east-2": "763104351884",
1340+
"us-gov-west-1": "442386744353",
1341+
"us-iso-east-1": "886529160074",
1342+
"us-west-1": "763104351884",
1343+
"us-west-2": "763104351884"
1344+
},
1345+
"repository": "tensorflow-inference"
13151346
}
13161347
}
13171348
},
@@ -1338,7 +1369,8 @@
13381369
"2.2": "2.2.2",
13391370
"2.3": "2.3.2",
13401371
"2.4": "2.4.3",
1341-
"2.5": "2.5.1"
1372+
"2.5": "2.5.1",
1373+
"2.6": "2.6.2"
13421374
},
13431375
"versions": {
13441376
"1.10.0": {
@@ -2531,6 +2563,72 @@
25312563
"us-west-2": "763104351884"
25322564
},
25332565
"repository": "tensorflow-training"
2566+
},
2567+
"2.6.0": {
2568+
"py_versions": [
2569+
"py38"
2570+
],
2571+
"registries": {
2572+
"af-south-1": "626614931356",
2573+
"ap-east-1": "871362719292",
2574+
"ap-northeast-1": "763104351884",
2575+
"ap-northeast-2": "763104351884",
2576+
"ap-northeast-3": "364406365360",
2577+
"ap-south-1": "763104351884",
2578+
"ap-southeast-1": "763104351884",
2579+
"ap-southeast-2": "763104351884",
2580+
"ca-central-1": "763104351884",
2581+
"cn-north-1": "727897471807",
2582+
"cn-northwest-1": "727897471807",
2583+
"eu-central-1": "763104351884",
2584+
"eu-north-1": "763104351884",
2585+
"eu-south-1": "692866216735",
2586+
"eu-west-1": "763104351884",
2587+
"eu-west-2": "763104351884",
2588+
"eu-west-3": "763104351884",
2589+
"me-south-1": "217643126080",
2590+
"sa-east-1": "763104351884",
2591+
"us-east-1": "763104351884",
2592+
"us-east-2": "763104351884",
2593+
"us-gov-west-1": "442386744353",
2594+
"us-iso-east-1": "886529160074",
2595+
"us-west-1": "763104351884",
2596+
"us-west-2": "763104351884"
2597+
},
2598+
"repository": "tensorflow-training"
2599+
},
2600+
"2.6.2": {
2601+
"py_versions": [
2602+
"py38"
2603+
],
2604+
"registries": {
2605+
"af-south-1": "626614931356",
2606+
"ap-east-1": "871362719292",
2607+
"ap-northeast-1": "763104351884",
2608+
"ap-northeast-2": "763104351884",
2609+
"ap-northeast-3": "364406365360",
2610+
"ap-south-1": "763104351884",
2611+
"ap-southeast-1": "763104351884",
2612+
"ap-southeast-2": "763104351884",
2613+
"ca-central-1": "763104351884",
2614+
"cn-north-1": "727897471807",
2615+
"cn-northwest-1": "727897471807",
2616+
"eu-central-1": "763104351884",
2617+
"eu-north-1": "763104351884",
2618+
"eu-south-1": "692866216735",
2619+
"eu-west-1": "763104351884",
2620+
"eu-west-2": "763104351884",
2621+
"eu-west-3": "763104351884",
2622+
"me-south-1": "217643126080",
2623+
"sa-east-1": "763104351884",
2624+
"us-east-1": "763104351884",
2625+
"us-east-2": "763104351884",
2626+
"us-gov-west-1": "442386744353",
2627+
"us-iso-east-1": "886529160074",
2628+
"us-west-1": "763104351884",
2629+
"us-west-2": "763104351884"
2630+
},
2631+
"repository": "tensorflow-training"
25342632
}
25352633
}
25362634
}

0 commit comments

Comments
 (0)