Commit 24a381d

Support evaluating on endpoint without type (#13226)
Signed-off-by: B-Step62 <[email protected]>
1 parent 55348b8 commit 24a381d

2 files changed: +121, -7 lines
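
For context, a minimal usage sketch of what this commit enables, mirroring the new test below: mlflow.evaluate can now target a deployment endpoint whose metadata declares no endpoint type, provided each input row is already in the shape the endpoint expects (the payload is passed through unmodified and the raw response is returned). The endpoint name and data here are hypothetical.

import pandas as pd
import mlflow

# Hypothetical endpoint with no "llm/v1/..." type declared; each "inputs" row is
# already a chat-style payload because no wrapping is applied for untyped endpoints.
data = pd.DataFrame(
    {
        "inputs": [
            {"messages": [{"role": "user", "content": "What is MLflow?"}], "max_tokens": 10},
            {"messages": [{"role": "user", "content": "What is Spark?"}], "max_tokens": 10},
        ],
        "ground_truth": ["An ML platform.", "A data processing engine."],
    }
)

with mlflow.start_run():
    result = mlflow.evaluate(
        model="endpoints:/my-untyped-endpoint",  # hypothetical endpoint name
        data=data,
        model_type="question-answering",
        targets="ground_truth",
        inference_params={"temperature": 0.5},
    )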

mlflow/metrics/genai/model_utils.py

Lines changed: 35 additions & 5 deletions
@@ -113,6 +113,14 @@ def _call_openai_api(openai_uri, payload, eval_parameters):
     return _parse_chat_response_format(resp)
 
 
+_PREDICT_ERROR_MSG = """\
+Failed to call the deployment endpoint. Please check the deployment URL \
+is set correctly and the input payload is valid.\n
+- Error: {e}\n
+- Deployment URI: {uri}\n
+- Input payload: {payload}"""
+
+
 def _call_deployments_api(deployment_uri, payload, eval_parameters, wrap_payload=True):
     """Call the deployment endpoint with the given payload and parameters.
 
@@ -142,19 +150,41 @@ def _call_deployments_api(deployment_uri, payload, eval_parameters, wrap_payload
         if wrap_payload:
             payload = {"prompt": payload}
         chat_inputs = {**payload, **eval_parameters}
-        response = client.predict(endpoint=deployment_uri, inputs=chat_inputs)
+        try:
+            response = client.predict(endpoint=deployment_uri, inputs=chat_inputs)
+        except Exception as e:
+            raise MlflowException(
+                _PREDICT_ERROR_MSG.format(e=e, uri=deployment_uri, payload=chat_inputs)
+            ) from e
         return _parse_completions_response_format(response)
     elif endpoint_type == "llm/v1/chat":
         if wrap_payload:
             payload = {"messages": [{"role": "user", "content": payload}]}
         completion_inputs = {**payload, **eval_parameters}
-        response = client.predict(endpoint=deployment_uri, inputs=completion_inputs)
+        try:
+            response = client.predict(endpoint=deployment_uri, inputs=completion_inputs)
+        except Exception as e:
+            raise MlflowException(
+                _PREDICT_ERROR_MSG.format(e=e, uri=deployment_uri, payload=completion_inputs)
+            ) from e
         return _parse_chat_response_format(response)
-
+    elif endpoint_type is None:
+        # If the endpoint type is not specified, we don't assume any format
+        # and directly send the payload to the endpoint. This is primarily for Databricks
+        # Managed Agent Evaluation, where the endpoint type may not be specified but the
+        # eval harness ensures that the payload is formatted to the chat format, as well
+        # as parsing the response.
+        inputs = {**payload, **eval_parameters}
+        try:
+            return client.predict(endpoint=deployment_uri, inputs=inputs)
+        except Exception as e:
+            raise MlflowException(
+                _PREDICT_ERROR_MSG.format(e=e, uri=deployment_uri, payload=inputs)
+            ) from e
     else:
         raise MlflowException(
-            f"Unsupported endpoint type: {endpoint_type}. Use an "
-            "endpoint of type 'llm/v1/completions' or 'llm/v1/chat' instead.",
+            f"Unsupported endpoint type: {endpoint_type}. Endpoint type, if specified, "
+            "must be 'llm/v1/completions' or 'llm/v1/chat'.",
             error_code=INVALID_PARAMETER_VALUE,
         )
 
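
As a quick illustration of the error handling added above, the snippet below re-declares the _PREDICT_ERROR_MSG template exactly as in the diff and formats it the way the new except branches do; the error, URI, and payload are hypothetical placeholders, not output from a real endpoint.

# The template copied from the diff above, rendered with hypothetical values.
_PREDICT_ERROR_MSG = """\
Failed to call the deployment endpoint. Please check the deployment URL \
is set correctly and the input payload is valid.\n
- Error: {e}\n
- Deployment URI: {uri}\n
- Input payload: {payload}"""

message = _PREDICT_ERROR_MSG.format(
    e=ValueError("Invalid payload"),  # hypothetical underlying error
    uri="endpoints:/random",          # hypothetical deployment URI
    payload={"invalid": "payload", "max_tokens": 10},
)
print(message)  # Lists the error, the deployment URI, and the payload that was sent.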

tests/evaluate/test_evaluation.py

Lines changed: 86 additions & 2 deletions
@@ -2032,7 +2032,7 @@ def test_evaluate_on_chat_model_endpoint(mock_deploy_client, input_data, feature
             },
         ),
     ]
-    assert all(call in call_args_list for call in expected_calls)
+    assert call_args_list == expected_calls
 
     # Validate the evaluation metrics
     expected_metrics_subset = {"toxicity/v1/ratio", "ari_grade_level/v1/mean"}
@@ -2089,7 +2089,7 @@ def test_evaluate_on_completion_model_endpoint(mock_deploy_client, input_data, f
         mock.call(endpoint="completions", inputs={"prompt": "What is MLflow?", "max_tokens": 10}),
         mock.call(endpoint="completions", inputs={"prompt": "What is Spark?", "max_tokens": 10}),
     ]
-    assert all(call in call_args_list for call in expected_calls)
+    assert call_args_list == expected_calls
 
     # Validate the evaluation metrics
     expected_metrics_subset = {
@@ -2104,6 +2104,90 @@ def test_evaluate_on_completion_model_endpoint(mock_deploy_client, input_data, f
     assert eval_results_table["outputs"].equals(pd.Series(["This is a response"] * 2))
 
 
+@mock.patch("mlflow.deployments.get_deploy_client")
+def test_evaluate_on_model_endpoint_without_type(mock_deploy_client):
+    # An endpoint that does not have an endpoint type. For such an endpoint, we simply
+    # pass the input data to the endpoint without any modification and return
+    # the response as is.
+    mock_deploy_client.return_value.get_endpoint.return_value = {}
+    mock_deploy_client.return_value.predict.return_value = "This is a response"
+
+    input_data = pd.DataFrame(
+        {
+            "inputs": [
+                {
+                    "messages": [{"content": q, "role": "user"}],
+                    "max_tokens": 10,
+                }
+                for q in _TEST_QUERY_LIST
+            ],
+            "ground_truth": _TEST_GT_LIST,
+        }
+    )
+
+    with mlflow.start_run():
+        eval_result = mlflow.evaluate(
+            model="endpoints:/random",
+            data=input_data,
+            model_type="question-answering",
+            targets="ground_truth",
+            inference_params={"max_tokens": 10, "temperature": 0.5},
+        )
+
+    # Validate the endpoint is called with correct payloads
+    call_args_list = mock_deploy_client.return_value.predict.call_args_list
+    expected_calls = [
+        mock.call(
+            endpoint="random",
+            inputs={
+                "messages": [{"content": "What is MLflow?", "role": "user"}],
+                "max_tokens": 10,
+                "temperature": 0.5,
+            },
+        ),
+        mock.call(
+            endpoint="random",
+            inputs={
+                "messages": [{"content": "What is Spark?", "role": "user"}],
+                "max_tokens": 10,
+                "temperature": 0.5,
+            },
+        ),
+    ]
+    assert call_args_list == expected_calls
+
+    # Validate the evaluation metrics
+    expected_metrics_subset = {"toxicity/v1/ratio", "ari_grade_level/v1/mean", "exact_match/v1"}
+    assert expected_metrics_subset.issubset(set(eval_result.metrics.keys()))
+
+    # Validate the model output is passed to the evaluator in the correct format (string)
+    eval_results_table = eval_result.tables["eval_results_table"]
+    assert eval_results_table["outputs"].equals(pd.Series(["This is a response"] * 2))
+
+
+@mock.patch("mlflow.deployments.get_deploy_client")
+def test_evaluate_on_model_endpoint_invalid_payload(mock_deploy_client):
+    # An endpoint that does not have an endpoint type. When the endpoint call fails
+    # (e.g. because the payload is invalid), the error should surface as an
+    # MlflowException with a descriptive message.
+    mock_deploy_client.return_value.get_endpoint.return_value = {}
+    mock_deploy_client.return_value.predict.side_effect = ValueError("Invalid payload")
+
+    input_data = pd.DataFrame(
+        {
+            "inputs": [{"invalid": "payload"}],
+        }
+    )
+
+    with pytest.raises(MlflowException, match="Failed to call the deployment endpoint"):
+        mlflow.evaluate(
+            model="endpoints:/random",
+            data=input_data,
+            model_type="question-answering",
+            inference_params={"max_tokens": 10, "temperature": 0.5},
+        )
+
+
 @pytest.mark.parametrize(
     ("input_data", "error_message"),
     [
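
To see the pass-through behavior these tests exercise without spinning up MLflow, here is a minimal sketch with a hand-rolled fake client (FakeDeployClient and its return value are made up for illustration; the real tests mock mlflow.deployments.get_deploy_client instead). It mirrors the endpoint_type-is-None branch: the payload and inference parameters are merged, sent as-is, and the raw response is returned.

class FakeDeployClient:
    # Hypothetical stand-in for the deployments client; records calls and
    # returns a fixed string, like the mock in the tests above.
    def __init__(self):
        self.calls = []

    def predict(self, endpoint, inputs):
        self.calls.append((endpoint, inputs))
        return "This is a response"


client = FakeDeployClient()
payload = {"messages": [{"role": "user", "content": "What is MLflow?"}]}
eval_parameters = {"max_tokens": 10, "temperature": 0.5}

# Merge the payload with the evaluation parameters, call predict, and hand back
# whatever the endpoint returned, without assuming any response format.
response = client.predict(endpoint="random", inputs={**payload, **eval_parameters})
assert response == "This is a response"
assert client.calls[0][1] == {**payload, **eval_parameters}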
