@@ -2032,7 +2032,7 @@ def test_evaluate_on_chat_model_endpoint(mock_deploy_client, input_data, feature
             },
         ),
     ]
-    assert all(call in call_args_list for call in expected_calls)
+    assert call_args_list == expected_calls
 
     # Validate the evaluation metrics
     expected_metrics_subset = {"toxicity/v1/ratio", "ari_grade_level/v1/mean"}
@@ -2089,7 +2089,7 @@ def test_evaluate_on_completion_model_endpoint(mock_deploy_client, input_data, f
         mock.call(endpoint="completions", inputs={"prompt": "What is MLflow?", "max_tokens": 10}),
         mock.call(endpoint="completions", inputs={"prompt": "What is Spark?", "max_tokens": 10}),
     ]
-    assert all(call in call_args_list for call in expected_calls)
+    assert call_args_list == expected_calls
 
     # Validate the evaluation metrics
     expected_metrics_subset = {
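Note on the assertion change in the two hunks above: `all(call in call_args_list for call in expected_calls)` only checks membership, so it keeps passing when the mock receives duplicate, extra, or reordered calls. Comparing with `==` also pins the call count and order. A minimal standalone sketch (not part of this PR) illustrating the difference:

```python
from unittest import mock

m = mock.Mock()
m(endpoint="completions", inputs={"prompt": "What is MLflow?", "max_tokens": 10})
m(endpoint="completions", inputs={"prompt": "What is MLflow?", "max_tokens": 10})  # unexpected duplicate

expected_calls = [
    mock.call(endpoint="completions", inputs={"prompt": "What is MLflow?", "max_tokens": 10}),
]

# The old membership check still passes despite the duplicate call:
assert all(call in m.call_args_list for call in expected_calls)

# The new equality check catches it: call_args_list has two entries, not one.
assert m.call_args_list != expected_calls
```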
@@ -2104,6 +2104,90 @@ def test_evaluate_on_completion_model_endpoint(mock_deploy_client, input_data, f
     assert eval_results_table["outputs"].equals(pd.Series(["This is a response"] * 2))
 
 
+@mock.patch("mlflow.deployments.get_deploy_client")
+def test_evaluate_on_model_endpoint_without_type(mock_deploy_client):
+    # An endpoint that does not have an endpoint type. For such an endpoint, we
+    # simply pass the input data to the endpoint without any modification and
+    # return the response as is.
+    mock_deploy_client.return_value.get_endpoint.return_value = {}
+    mock_deploy_client.return_value.predict.return_value = "This is a response"
+
+    input_data = pd.DataFrame(
+        {
+            "inputs": [
+                {
+                    "messages": [{"content": q, "role": "user"}],
+                    "max_tokens": 10,
+                }
+                for q in _TEST_QUERY_LIST
+            ],
+            "ground_truth": _TEST_GT_LIST,
+        }
+    )
+
+    with mlflow.start_run():
+        eval_result = mlflow.evaluate(
+            model="endpoints:/random",
+            data=input_data,
+            model_type="question-answering",
+            targets="ground_truth",
+            inference_params={"max_tokens": 10, "temperature": 0.5},
+        )
+
+    # Validate the endpoint is called with the correct payloads
+    call_args_list = mock_deploy_client.return_value.predict.call_args_list
+    expected_calls = [
+        mock.call(
+            endpoint="random",
+            inputs={
+                "messages": [{"content": "What is MLflow?", "role": "user"}],
+                "max_tokens": 10,
+                "temperature": 0.5,
+            },
+        ),
+        mock.call(
+            endpoint="random",
+            inputs={
+                "messages": [{"content": "What is Spark?", "role": "user"}],
+                "max_tokens": 10,
+                "temperature": 0.5,
+            },
+        ),
+    ]
+    assert call_args_list == expected_calls
+
+    # Validate the evaluation metrics
+    expected_metrics_subset = {"toxicity/v1/ratio", "ari_grade_level/v1/mean", "exact_match/v1"}
+    assert expected_metrics_subset.issubset(set(eval_result.metrics.keys()))
+
+    # Validate the model output is passed to the evaluator in the correct format (string)
+    eval_results_table = eval_result.tables["eval_results_table"]
+    assert eval_results_table["outputs"].equals(pd.Series(["This is a response"] * 2))
+
+
+@mock.patch("mlflow.deployments.get_deploy_client")
+def test_evaluate_on_model_endpoint_invalid_payload(mock_deploy_client):
+    # The endpoint rejects the payload. The evaluation should not propagate the
+    # raw error; it should instead fail with an MlflowException explaining that
+    # the call to the deployment endpoint failed.
+    mock_deploy_client.return_value.get_endpoint.return_value = {}
+    mock_deploy_client.return_value.predict.side_effect = ValueError("Invalid payload")
+
+    input_data = pd.DataFrame(
+        {
+            "inputs": [{"invalid": "payload"}],
+        }
+    )
+
+    with pytest.raises(MlflowException, match="Failed to call the deployment endpoint"):
+        mlflow.evaluate(
+            model="endpoints:/random",
+            data=input_data,
+            model_type="question-answering",
+            inference_params={"max_tokens": 10, "temperature": 0.5},
+        )
+
+
 @pytest.mark.parametrize(
     ("input_data", "error_message"),
     [
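For context on the two new tests: when `get_endpoint` reports no endpoint type, each row's payload is forwarded to `predict` unchanged (with `inference_params` merged in), and any error raised by the endpoint surfaces as an `MlflowException`. Below is a rough sketch of that contract, using a hypothetical helper name `_call_endpoint` rather than MLflow's actual internals:

```python
from unittest import mock

from mlflow.exceptions import MlflowException


def _call_endpoint(client, endpoint_name, payload, inference_params=None):
    # No endpoint type reported: merge inference_params into the payload and
    # forward it as-is. (Typed chat/completions endpoints would get
    # task-specific formatting instead; omitted in this sketch.)
    inputs = {**payload, **(inference_params or {})}
    try:
        return client.predict(endpoint=endpoint_name, inputs=inputs)
    except Exception as e:
        # Matches the error message the invalid-payload test asserts on.
        raise MlflowException(f"Failed to call the deployment endpoint: {e}") from e


# Exercise the sketch with a mocked client, mirroring the tests' setup.
client = mock.Mock()
client.get_endpoint.return_value = {}
client.predict.return_value = "This is a response"
response = _call_endpoint(
    client,
    "random",
    {"messages": [{"content": "What is MLflow?", "role": "user"}]},
    {"max_tokens": 10, "temperature": 0.5},
)
assert response == "This is a response"
```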