Skip to content

Commit 2d122b7

Browse files
committed
feat: add ANALYSIS_CONFIG_SCHEMA_V1_0 to clarify
1 parent d9559c7 commit 2d122b7

File tree

3 files changed

+254
-0
lines changed

3 files changed

+254
-0
lines changed

setup.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,7 @@ def read_requirements(filename):
5858
"packaging>=20.0",
5959
"pandas",
6060
"pathos",
61+
"schema",
6162
]
6263

6364
# Specific use case dependencies

src/sagemaker/clarify.py

Lines changed: 252 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,12 +27,263 @@
2727
from abc import ABC, abstractmethod
2828
from typing import List, Union, Dict
2929

30+
from schema import Schema, And, Use, Or, Optional, Regex
31+
3032
from sagemaker import image_uris, s3, utils
3133
from sagemaker.processing import ProcessingInput, ProcessingOutput, Processor
3234

3335
logger = logging.getLogger(__name__)
3436

3537

38+
# Regex for a valid SageMaker endpoint-name prefix: alphanumeric characters,
# optionally separated by runs of hyphens, starting with an alphanumeric.
# The trailing "*" on the group is required so that a single-character prefix
# (e.g. "a") is accepted; without it the pattern demands at least two
# alphanumeric characters.
ENDPOINT_NAME_PREFIX_PATTERN = "^[a-zA-Z0-9](-*[a-zA-Z0-9])*"
39+
40+
41+
# Schema (version 1.0) for the Clarify analysis configuration dictionary.
# It is applied via ANALYSIS_CONFIG_SCHEMA_V1_0.validate(analysis_config) in
# the processor's _run() before the config is written to disk, so a malformed
# config fails fast on the client instead of inside the processing job.
# Keys wrapped in Optional(...) may be omitted; bare string keys are required.
ANALYSIS_CONFIG_SCHEMA_V1_0 = Schema(
    {
        Optional("version"): str,
        # MIME type of the input dataset; lower-cased before membership check.
        "dataset_type": And(
            str,
            Use(str.lower),
            lambda s: s
            in (
                "text/csv",
                "application/jsonlines",
                "application/sagemakercapturejson",
                "application/x-parquet",
                "application/x-image",
            ),
        ),
        Optional("dataset_uri"): str,
        Optional("headers"): [str],
        # Label column: name (str) or index (int).
        Optional("label"): Or(str, int),
        # this field indicates user provides predicted_label in dataset
        Optional("predicted_label"): Or(str, int),
        Optional("features"): str,
        Optional("label_values_or_threshold"): [Or(int, float, str)],
        Optional("probability_threshold"): float,
        # Facets (sensitive attributes) for bias analysis; each entry names a
        # column by name or index, with optional value(s)/threshold(s).
        Optional("facet"): [{"name_or_index": Or(str, int), Optional("value_or_threshold"): [Or(int, float, str)]}],
        Optional("facet_dataset_uri"): str,
        Optional("facet_headers"): [str],
        Optional("predicted_label_dataset_uri"): str,
        Optional("predicted_label_headers"): [str],
        Optional("excluded_columns"): [Or(int, str)],
        Optional("joinsource_name_or_index"): Or(str, int),
        Optional("group_variable"): Or(str, int),
        # The analyses to run; "methods" itself is required, every method within
        # it is optional.
        "methods": {
            Optional("shap"): {
                Optional("baseline"): Or(
                    # URI of the baseline data file
                    str,
                    # Inplace baseline data (a list of something)
                    [
                        Or(
                            # CSV row
                            [Or(int, float, str, None)],
                            # JSON row (any JSON object). As I write this only SageMaker JSONLines Dense Format ([1])
                            # is supported and the validation is NOT done by the schema but by the data loader.
                            # [1] https://docs.aws.amazon.com/sagemaker/latest/dg/cdf-inference.html#cm-jsonlines
                            {object: object},
                        )
                    ],
                ),
                Optional("num_clusters"): int,
                Optional("use_logit"): bool,
                Optional("num_samples"): int,
                Optional("agg_method"): And(str, Use(str.lower), lambda s: s in ("mean_abs", "median", "mean_sq")),
                Optional("save_local_shap_values"): bool,
                # NLP explainability settings; language accepts full names and
                # ISO-style short codes (presumably mirroring the spaCy model
                # set — confirm against the container's supported languages).
                Optional("text_config"): {
                    "granularity": And(str, Use(str.lower), lambda s: s in ("token", "sentence", "paragraph")),
                    "language": And(
                        str,
                        Use(str.lower),
                        lambda s: s
                        in (
                            "chinese",
                            "zh",
                            "danish",
                            "da",
                            "dutch",
                            "nl",
                            "english",
                            "en",
                            "french",
                            "fr",
                            "german",
                            "de",
                            "greek",
                            "el",
                            "italian",
                            "it",
                            "japanese",
                            "ja",
                            "lithuanian",
                            "lt",
                            "multi-language",
                            "xx",
                            "norwegian bokmål",
                            "nb",
                            "polish",
                            "pl",
                            "portuguese",
                            "pt",
                            "romanian",
                            "ro",
                            "russian",
                            "ru",
                            "spanish",
                            "es",
                            "afrikaans",
                            "af",
                            "albanian",
                            "sq",
                            "arabic",
                            "ar",
                            "armenian",
                            "hy",
                            "basque",
                            "eu",
                            "bengali",
                            "bn",
                            "bulgarian",
                            "bg",
                            "catalan",
                            "ca",
                            "croatian",
                            "hr",
                            "czech",
                            "cs",
                            "estonian",
                            "et",
                            "finnish",
                            "fi",
                            "gujarati",
                            "gu",
                            "hebrew",
                            "he",
                            "hindi",
                            "hi",
                            "hungarian",
                            "hu",
                            "icelandic",
                            "is",
                            "indonesian",
                            "id",
                            "irish",
                            "ga",
                            "kannada",
                            "kn",
                            "kyrgyz",
                            "ky",
                            "latvian",
                            "lv",
                            "ligurian",
                            "lij",
                            "luxembourgish",
                            "lb",
                            "macedonian",
                            "mk",
                            "malayalam",
                            "ml",
                            "marathi",
                            "mr",
                            "nepali",
                            "ne",
                            "persian",
                            "fa",
                            "sanskrit",
                            "sa",
                            "serbian",
                            "sr",
                            "setswana",
                            "tn",
                            "sinhala",
                            "si",
                            "slovak",
                            "sk",
                            "slovenian",
                            "sl",
                            "swedish",
                            "sv",
                            "tagalog",
                            "tl",
                            "tamil",
                            "ta",
                            "tatar",
                            "tt",
                            "telugu",
                            "te",
                            "thai",
                            "th",
                            "turkish",
                            "tr",
                            "ukrainian",
                            "uk",
                            "urdu",
                            "ur",
                            "vietnamese",
                            "vi",
                            "yoruba",
                            "yo",
                        ),
                    ),
                    Optional("max_top_tokens"): int,
                },
                # Computer-vision explainability settings.
                Optional("image_config"): {
                    Optional("num_segments"): int,
                    Optional("segment_compactness"): int,
                    Optional("feature_extraction_method"): str,
                    Optional("model_type"): str,
                    Optional("max_objects"): int,
                    Optional("iou_threshold"): float,
                    Optional("context"): float,
                    Optional("debug"): {
                        Optional("image_names"): [str],
                        Optional("class_ids"): [int],
                        Optional("sample_from"): int,
                        Optional("sample_to"): int,
                    },
                },
                Optional("seed"): int,
            },
            # Bias methods accept a single method name or a list of names
            # (e.g. "all"; exact method IDs are validated server-side, not here).
            Optional("pre_training_bias"): {"methods": Or(str, [str])},
            Optional("post_training_bias"): {"methods": Or(str, [str])},
            # Partial dependence plots; grid_resolution is required when pdp
            # is present.
            Optional("pdp"): {
                "grid_resolution": int,
                Optional("features"): [Or(str, int)],
                Optional("top_k_features"): int,
            },
            Optional("report"): {"name": str, Optional("title"): str},
        },
        # Settings for the shadow endpoint Clarify creates (or reuses) to get
        # model predictions during the analysis.
        Optional("predictor"): {
            Optional("endpoint_name"): str,
            Optional("endpoint_name_prefix"): And(
                str, Regex(ENDPOINT_NAME_PREFIX_PATTERN)
            ),
            Optional("model_name"): str,
            Optional("target_model"): str,
            Optional("instance_type"): str,
            Optional("initial_instance_count"): int,
            Optional("accelerator_type"): str,
            # Request/response MIME types; lower-cased before membership check.
            Optional("content_type"): And(
                str,
                Use(str.lower),
                lambda s: s
                in ("text/csv", "application/jsonlines", "image/jpeg", "image/jpg", "image/png", "application/x-npy"),
            ),
            Optional("accept_type"): And(
                str, Use(str.lower), lambda s: s in ("text/csv", "application/jsonlines", "application/json")
            ),
            Optional("label"): Or(str, int),
            Optional("probability"): Or(str, int),
            Optional("label_headers"): [Or(str, int)],
            Optional("content_template"): Or(str, {str: str}),
            Optional("custom_attributes"): str,
        },
        # In-process predictor alternative to a real endpoint: a Python module
        # and class to instantiate locally, with optional constructor args.
        Optional("local_predictor"): {"python_module": str, "class": str, Optional("args"): [str]},
    }
)
285+
286+
36287
class DataConfig:
37288
"""Config object related to configurations of the input and output dataset."""
38289

@@ -1030,6 +1281,7 @@ def _run(
10301281
# for debugging: to access locally, i.e. without a need to look for it in an S3 bucket
10311282
self._last_analysis_config = analysis_config
10321283
logger.info("Analysis Config: %s", analysis_config)
1284+
ANALYSIS_CONFIG_SCHEMA_V1_0.validate(analysis_config)
10331285

10341286
with tempfile.TemporaryDirectory() as tmpdirname:
10351287
analysis_config_file = os.path.join(tmpdirname, "analysis_config.json")

tests/unit/test_clarify.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -914,6 +914,7 @@ def test_run_on_s3_analysis_config_file(
914914
processor_run, sagemaker_session, clarify_processor, data_config
915915
):
916916
analysis_config = {
917+
"dataset_type": "text/csv",
917918
"methods": {"post_training_bias": {"methods": "all"}},
918919
}
919920
with patch("sagemaker.clarify._upload_analysis_config", return_value=None) as mock_method:

0 commit comments

Comments
 (0)