From 1aaaf28fb0c52a1e8928d8a777e7b0284027a027 Mon Sep 17 00:00:00 2001 From: Benjamin Crabtree Date: Wed, 13 Apr 2022 18:44:14 +0000 Subject: [PATCH 1/4] feat: add example notebooks to jumpstart documentation --- doc/overview.rst | 40 +++++++++++++++++++++++++++++++++++++--- 1 file changed, 37 insertions(+), 3 deletions(-) diff --git a/doc/overview.rst b/doc/overview.rst index df340338d9..69496674aa 100644 --- a/doc/overview.rst +++ b/doc/overview.rst @@ -573,15 +573,49 @@ Here is an example: # When you are done using your endpoint model.sagemaker_session.delete_endpoint('my-endpoint') -******************************************** -Use Prebuilt Models with SageMaker JumpStart -******************************************** +********************************************************* +Use SageMaker JumpStart Algorithms with Pretrained Models +********************************************************* + +JumpStart for the SageMaker Python SDK uses model IDs and model versions to access the necessary +utilities. The table linked below lists the available models, plus some extra information that is useful +in selecting the correct model ID and corresponding parameters. .. toctree:: :maxdepth: 2 doc_utils/jumpstart +Example notebooks +================= + +JumpStart supports 15 different machine learning problem types. Below is a list of all the supported +problem types with a link to a Jupyter notebook that provides example usage. 
+ +Vision + - `Image Classification `__ + - `Object Detection `__ + - `Semantic Segmentation `__ + - `Instance Segmentation `__ + - `Image Embedding `__ + +Text + - `Text Classification `__ + - `Sentence Pair Classification `__ + - `Question Answering `__ + - `Named Entity Recognition `__ + - `Text Summarization `__ + - `Text Generation `__ + - `Machine Translation `__ + - `Text Embedding `__ + +Tabular + - `Tabular Classification (LightGBM & CatBoost) `__ + - `Tabular Classification (XGBoost & Linear Learner) `__ + - `Tabular Regression (LightGBM & CatBoost) `__ + - `Tabular Regression (XGBoost & Linear Learner) `__ + + `Amazon SageMaker JumpStart `__ is a SageMaker feature that helps users bring machine learning (ML) applications to market using prebuilt solutions for common use cases, From 95b10565ec351b648672482996f903b816c0a7bd Mon Sep 17 00:00:00 2001 From: Benjamin Crabtree Date: Wed, 13 Apr 2022 19:13:29 +0000 Subject: [PATCH 2/4] chore: remove trailing whitespaces --- doc/overview.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/overview.rst b/doc/overview.rst index 69496674aa..bd3f1fd147 100644 --- a/doc/overview.rst +++ b/doc/overview.rst @@ -598,7 +598,7 @@ Vision - `Semantic Segmentation `__ - `Instance Segmentation `__ - `Image Embedding `__ - + Text - `Text Classification `__ - `Sentence Pair Classification `__ @@ -608,7 +608,7 @@ Text - `Text Generation `__ - `Machine Translation `__ - `Text Embedding `__ - + Tabular - `Tabular Classification (LightGBM & CatBoost) `__ - `Tabular Classification (XGBoost & Linear Learner) `__ From 4d82bdc9ca65b2b337aa410a1f11e94a4141da08 Mon Sep 17 00:00:00 2001 From: Benjamin Crabtree Date: Fri, 15 Apr 2022 00:14:16 +0000 Subject: [PATCH 3/4] chore: update source column --- doc/doc_utils/jumpstart_doc_utils.py | 73 ++++++++++++++++++++++++++-- 1 file changed, 69 insertions(+), 4 deletions(-) diff --git a/doc/doc_utils/jumpstart_doc_utils.py b/doc/doc_utils/jumpstart_doc_utils.py index 
47cf6e5f39..ba3ced2473 100644 --- a/doc/doc_utils/jumpstart_doc_utils.py +++ b/doc/doc_utils/jumpstart_doc_utils.py @@ -13,13 +13,71 @@ from __future__ import absolute_import from urllib import request import json -from packaging.version import Version +from packaging.version import Version +from enum import Enum + +class Tasks(str, Enum): + """The ML task name as referenced in the infix of the model ID.""" + + IC = "ic" + OD = "od" + OD1 = "od1" + SEMSEG = "semseg" + IS = "is" + TC = "tc" + SPC = "spc" + EQA = "eqa" + TEXT_GENERATION = "textgeneration" + IC_EMBEDDING = "icembedding" + TC_EMBEDDING = "tcembedding" + NER = "ner" + SUMMARIZATION = "summarization" + TRANSLATION = "translation" + TABULAR_REGRESSION = "regression" + TABULAR_CLASSIFICATION = "classification" + +class ProblemTypes(str, Enum): + """Possible problem types for JumpStart models.""" + + IMAGE_CLASSIFICATION = "Image Classification" + IMAGE_EMBEDDING = "Image Embedding" + OBJECT_DETECTION = "Object Detection" + SEMANTIC_SEGMENTATION = "Semantic Segmentation" + INSTANCE_SEGMENTATION = "Instance Segmentation" + TEXT_CLASSIFICATION = "Text Classification" + TEXT_EMBEDDING = "Text Embedding" + QUESTION_ANSWERING = "Question Answering" + SENTENCE_PAIR_CLASSIFICATION = "Sentence Pair Classification" + TEXT_GENERATION = "Text Generation" + TEXT_SUMMARIZATION = "Text Summarization" + MACHINE_TRANSLATION = "Machine Translation" + NAMED_ENTITY_RECOGNITION = "Named Entity Recognition" + TABULAR_REGRESSION = "Regression" + TABULAR_CLASSIFICATION = "Classification" JUMPSTART_REGION = "eu-west-2" SDK_MANIFEST_FILE = "models_manifest.json" JUMPSTART_BUCKET_BASE_URL = "https://jumpstart-cache-prod-{}.s3.{}.amazonaws.com".format( JUMPSTART_REGION, JUMPSTART_REGION ) +TASK_MAP = { + Tasks.IC: ProblemTypes.IMAGE_CLASSIFICATION, + Tasks.IC_EMBEDDING: ProblemTypes.IMAGE_EMBEDDING, + Tasks.OD: ProblemTypes.OBJECT_DETECTION, + Tasks.OD1: ProblemTypes.OBJECT_DETECTION, + Tasks.SEMSEG: 
ProblemTypes.SEMANTIC_SEGMENTATION, + Tasks.IS: ProblemTypes.INSTANCE_SEGMENTATION, + Tasks.TC: ProblemTypes.TEXT_CLASSIFICATION, + Tasks.TC_EMBEDDING: ProblemTypes.TEXT_EMBEDDING, + Tasks.EQA: ProblemTypes.QUESTION_ANSWERING, + Tasks.SPC: ProblemTypes.SENTENCE_PAIR_CLASSIFICATION, + Tasks.TEXT_GENERATION: ProblemTypes.TEXT_GENERATION, + Tasks.SUMMARIZATION: ProblemTypes.TEXT_SUMMARIZATION, + Tasks.TRANSLATION: ProblemTypes.MACHINE_TRANSLATION, + Tasks.NER: ProblemTypes.NAMED_ENTITY_RECOGNITION, + Tasks.TABULAR_REGRESSION: ProblemTypes.TABULAR_REGRESSION, + Tasks.TABULAR_CLASSIFICATION: ProblemTypes.TABULAR_CLASSIFICATION, +} def get_jumpstart_sdk_manifest(): @@ -35,6 +93,10 @@ def get_jumpstart_sdk_spec(key): model_spec = f.read().decode("utf-8") return json.loads(model_spec) +def get_model_task(id): + task_short = id.split('-')[1] + return TASK_MAP[task_short] if task_short in TASK_MAP else 'Source' + def create_jumpstart_model_table(): sdk_manifest = get_jumpstart_sdk_manifest() @@ -69,12 +131,12 @@ def create_jumpstart_model_table(): ) file_content.append( """ - Each model id is linked to an external page that describes the model.\n + Click on the Problem Type to navigate to the source of the model.\n """ ) file_content.append("\n") file_content.append(".. 
list-table:: Available Models\n") - file_content.append(" :widths: 50 20 20 20\n") + file_content.append(" :widths: 50 20 20 20 30\n") file_content.append(" :header-rows: 1\n") file_content.append(" :class: datatable\n") file_content.append("\n") @@ -82,13 +144,16 @@ def create_jumpstart_model_table(): file_content.append(" - Fine Tunable?\n") file_content.append(" - Latest Version\n") file_content.append(" - Min SDK Version\n") + file_content.append(" - Problem Type/Source\n") for model in sdk_manifest_top_versions_for_models.values(): model_spec = get_jumpstart_sdk_spec(model["spec_key"]) - file_content.append(" * - `{} <{}>`_\n".format(model_spec["model_id"], model_spec["url"])) + model_task = get_model_task(model_spec["model_id"]) + file_content.append(" * - {}\n".format(model_spec["model_id"])) file_content.append(" - {}\n".format(model_spec["training_supported"])) file_content.append(" - {}\n".format(model["version"])) file_content.append(" - {}\n".format(model["min_version"])) + file_content.append(" - `{} <{}>`__\n".format(model_task, model_spec["url"])) f = open("doc_utils/jumpstart.rst", "w") f.writelines(file_content) From d425ff9e12076673513c25f6e56506f266ea60a2 Mon Sep 17 00:00:00 2001 From: Benjamin Crabtree Date: Fri, 15 Apr 2022 00:31:28 +0000 Subject: [PATCH 4/4] chore: update styles --- doc/doc_utils/jumpstart_doc_utils.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/doc/doc_utils/jumpstart_doc_utils.py b/doc/doc_utils/jumpstart_doc_utils.py index ba3ced2473..b692715706 100644 --- a/doc/doc_utils/jumpstart_doc_utils.py +++ b/doc/doc_utils/jumpstart_doc_utils.py @@ -13,9 +13,10 @@ from __future__ import absolute_import from urllib import request import json -from packaging.version import Version +from packaging.version import Version from enum import Enum + class Tasks(str, Enum): """The ML task name as referenced in the infix of the model ID.""" @@ -36,6 +37,7 @@ class Tasks(str, Enum): TABULAR_REGRESSION = 
"regression" TABULAR_CLASSIFICATION = "classification" + class ProblemTypes(str, Enum): """Possible problem types for JumpStart models.""" @@ -55,6 +57,7 @@ class ProblemTypes(str, Enum): TABULAR_REGRESSION = "Regression" TABULAR_CLASSIFICATION = "Classification" + JUMPSTART_REGION = "eu-west-2" SDK_MANIFEST_FILE = "models_manifest.json" JUMPSTART_BUCKET_BASE_URL = "https://jumpstart-cache-prod-{}.s3.{}.amazonaws.com".format( @@ -93,9 +96,10 @@ def get_jumpstart_sdk_spec(key): model_spec = f.read().decode("utf-8") return json.loads(model_spec) + def get_model_task(id): - task_short = id.split('-')[1] - return TASK_MAP[task_short] if task_short in TASK_MAP else 'Source' + task_short = id.split("-")[1] + return TASK_MAP[task_short] if task_short in TASK_MAP else "Source" def create_jumpstart_model_table():