feature: Add support for JSON model inputs for Clarify Processor (#3768)

spoorn · Michael Trinh · web-flow · commit 4d1f9015bfc9 · 2023-04-10T20:49:07.000-07:00
Co-authored-by: Michael Trinh <mttrinh@amazon.com>
diff --git a/src/sagemaker/clarify.py b/src/sagemaker/clarify.py
@@ -282,6 +282,7 @@
                 in (
                     "text/csv",
                     "application/jsonlines",
+                    "application/json",
                     "image/jpeg",
                     "image/png",
                     "application/x-npy",
@@ -296,6 +297,7 @@
             SchemaOptional("probability"): Or(str, int),
             SchemaOptional("label_headers"): [Or(str, int)],
             SchemaOptional("content_template"): Or(str, {str: str}),
+            SchemaOptional("record_template"): str,
             SchemaOptional("custom_attributes"): str,
         },
     }
@@ -573,6 +575,7 @@ def __init__(
         accept_type: Optional[str] = None,
         content_type: Optional[str] = None,
         content_template: Optional[str] = None,
+        record_template: Optional[str] = None,
         custom_attributes: Optional[str] = None,
         accelerator_type: Optional[str] = None,
         endpoint_name_prefix: Optional[str] = None,
@@ -599,14 +602,80 @@ def __init__(
                 ``"application/jsonlines"`` for JSON Lines, and ``"application/json"`` for JSON.
                 Default is the same as ``content_type``.
             content_type (str): The model input format to be used for getting inferences with the
-                shadow endpoint. Valid values are ``"text/csv"`` for CSV and
-                ``"application/jsonlines"`` for JSON Lines. Default is the same as
-                ``dataset_format``.
+                shadow endpoint. Valid values are ``"text/csv"`` for CSV,
+                ``"application/jsonlines"`` for JSON Lines, and ``"application/json"`` for JSON.
+                Default is the same as ``dataset_format``.
             content_template (str): A template string to be used to construct the model input from
-                dataset instances. It is only used when ``model_content_type`` is
-                ``"application/jsonlines"``. The template should have one and only one placeholder,
-                ``"features"``, which will be replaced by a features list to form the model
-                inference input.
+                dataset instances. It is only used, and required, when ``content_type`` is
+                ``"application/jsonlines"`` or ``"application/json"``. When ``content_type``
+                is ``application/jsonlines``, the template should have one and only one
+                placeholder, ``$features``, which will be replaced by a features list for each
+                record to form the model inference input. When ``content_type`` is
+                ``application/json``, the template can have either placeholder ``$record``, which
+                will be replaced by a single record templated by ``record_template`` and only a
+                single record at a time will be sent to the model, or placeholder ``$records``,
+                which will be replaced by a list of records, each templated by ``record_template``.
+            record_template (str): A template string to be used to construct each record of the
+                model input from dataset instances. It is only used, and required, when
+                ``content_type`` is ``"application/json"``.
+                The template string may contain one of the following:
+
+                * Placeholder ``$features`` that will be substituted by the array of feature values
+                  and/or an optional placeholder ``$feature_names`` that will be substituted by the
+                  array of feature names.
+                * Exactly one placeholder ``$features_kvp`` that will be substituted by the
+                  key-value pairs of feature name and feature value.
+                * Or for each feature, if "A" is the feature name in the ``headers`` configuration,
+                  then placeholder syntax ``"${A}"`` (the double-quotes are part of the
+                  placeholder) will be substituted by the feature value.
+
+                ``record_template`` will be used in conjunction with ``content_template`` to
+                construct the model input.
+
+                **Examples:**
+
+                Given:
+
+                * ``headers``: ``["A", "B"]``
+                * ``features``: ``[[0, 1], [3, 4]]``
+
+                Example model input 1::
+
+                    {
+                        "instances": [[0, 1], [3, 4]],
+                        "feature_names": ["A", "B"]
+                    }
+
+                content_template and record_template to construct above:
+
+                * ``content_template``: ``"{\"instances\": $records}"``
+                * ``record_template``: ``"$features"``
+
+                Example model input 2::
+
+                    [
+                        { "A": 0, "B": 1 },
+                        { "A": 3, "B": 4 }
+                    ]
+
+                content_template and record_template to construct above:
+
+                * ``content_template``: ``"$records"``
+                * ``record_template``: ``"$features_kvp"``
+
+                Or, alternatively:
+
+                * ``content_template``: ``"$records"``
+                * ``record_template``: ``"{\"A\": \"${A}\", \"B\": \"${B}\"}"``
+
+                Example model input 3 (single record only)::
+
+                    { "A": 0, "B": 1 }
+
+                content_template and record_template to construct above:
+
+                * ``content_template``: ``"$record"``
+                * ``record_template``: ``"$features_kvp"``
             custom_attributes (str): Provides additional information about a request for an
                 inference submitted to a model hosted at an Amazon SageMaker endpoint. The
                 information is an opaque value that is forwarded verbatim. You could use this
@@ -677,6 +746,7 @@ def __init__(
             if content_type not in [
                 "text/csv",
                 "application/jsonlines",
+                "application/json",
                 "image/jpeg",
                 "image/jpg",
                 "image/png",
@@ -686,14 +756,32 @@ def __init__(
                     f"Invalid content_type {content_type}."
                     f" Please choose text/csv or application/jsonlines."
                 )
+            if content_type == "application/jsonlines":
+                if content_template is None:
+                    raise ValueError(
+                        f"content_template field is required for content_type {content_type}"
+                    )
+                if "$features" not in content_template:
+                    raise ValueError(
+                        f"Invalid content_template {content_template}."
+                        f" Please include a placeholder $features."
+                    )
+            if content_type == "application/json":
+                if content_template is None or record_template is None:
+                    raise ValueError(
+                        f"content_template and record_template are required for content_type "
+                        f"{content_type}"
+                    )
+                if "$record" not in content_template:
+                    raise ValueError(
+                        f"Invalid content_template {content_template}."
+                        f" Please include either placeholder $records or $record."
+                    )
             self.predictor_config["content_type"] = content_type
         if content_template is not None:
-            if "$features" not in content_template:
-                raise ValueError(
-                    f"Invalid content_template {content_template}."
-                    f" Please include a placeholder $features."
-                )
             self.predictor_config["content_template"] = content_template
+        if record_template is not None:
+            self.predictor_config["record_template"] = record_template
         _set(custom_attributes, "custom_attributes", self.predictor_config)
         _set(accelerator_type, "accelerator_type", self.predictor_config)
         _set(target_model, "target_model", self.predictor_config)
diff --git a/tests/unit/sagemaker/monitor/test_clarify_model_monitor.py b/tests/unit/sagemaker/monitor/test_clarify_model_monitor.py
@@ -365,6 +365,7 @@
 MODEL_NAME = "xgboost-model"
 ACCEPT_TYPE = "text/csv"
 CONTENT_TYPE = "application/jsonlines"
+JSONLINES_CONTENT_TEMPLATE = '{"instances":$features}'
 EXPLAINABILITY_ANALYSIS_CONFIG = {
     "headers": ANALYSIS_CONFIG_HEADERS_OF_FEATURES,
     "methods": {
@@ -382,6 +383,7 @@
         "initial_instance_count": INSTANCE_COUNT,
         "accept_type": ACCEPT_TYPE,
         "content_type": CONTENT_TYPE,
+        "content_template": JSONLINES_CONTENT_TEMPLATE,
     },
 }
 EXPLAINABILITY_ANALYSIS_CONFIG_WITH_LABEL_HEADERS = copy.deepcopy(EXPLAINABILITY_ANALYSIS_CONFIG)
@@ -489,6 +491,7 @@ def model_config():
         instance_count=INSTANCE_COUNT,
         content_type=CONTENT_TYPE,
         accept_type=ACCEPT_TYPE,
+        content_template=JSONLINES_CONTENT_TEMPLATE,
     )
 
 
diff --git a/tests/unit/test_clarify.py b/tests/unit/test_clarify.py
@@ -393,6 +393,9 @@ def test_facet_of_bias_config(facet_name, facet_values_or_threshold, expected_re
         ("text/csv", "application/json"),
         ("application/jsonlines", "application/json"),
         ("application/jsonlines", "text/csv"),
+        ("application/json", "application/json"),
+        ("application/json", "application/jsonlines"),
+        ("application/json", "text/csv"),
         ("image/jpeg", "text/csv"),
         ("image/jpg", "text/csv"),
         ("image/png", "text/csv"),
@@ -406,12 +409,22 @@ def test_valid_model_config(content_type, accept_type):
     custom_attributes = "c000b4f9-df62-4c85-a0bf-7c525f9104a4"
     target_model = "target_model_name"
     accelerator_type = "ml.eia1.medium"
+    content_template = (
+        '{"instances":$features}'
+        if content_type == "application/jsonlines"
+        else "$records"
+        if content_type == "application/json"
+        else None
+    )
+    record_template = "$features_kvp" if content_type == "application/json" else None
     model_config = ModelConfig(
         model_name=model_name,
         instance_type=instance_type,
         instance_count=instance_count,
         accept_type=accept_type,
         content_type=content_type,
+        content_template=content_template,
+        record_template=record_template,
         custom_attributes=custom_attributes,
         accelerator_type=accelerator_type,
         target_model=target_model,
@@ -426,21 +439,79 @@ def test_valid_model_config(content_type, accept_type):
         "accelerator_type": accelerator_type,
         "target_model": target_model,
     }
+    if content_template is not None:
+        expected_config["content_template"] = content_template
+    if record_template is not None:
+        expected_config["record_template"] = record_template
     assert expected_config == model_config.get_predictor_config()
 
 
-def test_invalid_model_config():
-    with pytest.raises(ValueError) as error:
+@pytest.mark.parametrize(
+    ("error", "content_type", "accept_type", "content_template", "record_template"),
+    [
+        (
+            "Invalid accept_type invalid_accept_type. Please choose text/csv or application/jsonlines.",
+            "text/csv",
+            "invalid_accept_type",
+            None,
+            None,
+        ),
+        (
+            "Invalid content_type invalid_content_type. Please choose text/csv or application/jsonlines.",
+            "invalid_content_type",
+            "text/csv",
+            None,
+            None,
+        ),
+        (
+            "content_template field is required for content_type",
+            "application/jsonlines",
+            "text/csv",
+            None,
+            None,
+        ),
+        (
+            "content_template and record_template are required for content_type",
+            "application/json",
+            "text/csv",
+            None,
+            None,
+        ),
+        (
+            "content_template and record_template are required for content_type",
+            "application/json",
+            "text/csv",
+            "$records",
+            None,
+        ),
+        (
+            r"Invalid content_template invalid_content_template. Please include a placeholder \$features.",
+            "application/jsonlines",
+            "text/csv",
+            "invalid_content_template",
+            None,
+        ),
+        (
+            r"Invalid content_template invalid_content_template. Please include either placeholder "
+            r"\$records or \$record.",
+            "application/json",
+            "text/csv",
+            "invalid_content_template",
+            "$features",
+        ),
+    ],
+)
+def test_invalid_model_config(error, content_type, accept_type, content_template, record_template):
+    with pytest.raises(ValueError, match=error):
         ModelConfig(
             model_name="xgboost-model",
             instance_type="ml.c5.xlarge",
             instance_count=1,
-            accept_type="invalid_accept_type",
+            content_type=content_type,
+            accept_type=accept_type,
+            content_template=content_template,
+            record_template=record_template,
         )
-    assert (
-        "Invalid accept_type invalid_accept_type. Please choose text/csv or application/jsonlines."
-        in str(error.value)
-    )
 
 
 def test_invalid_model_config_with_bad_endpoint_name_prefix():