
Commit b70c474

[MCP] Add local/remote endpoint inference support (#3121)

* allow endpoint url in tiny-agents
* nit
* explicitly fail if no model or base_url is provided

Co-authored-by: Lucain <[email protected]>

1 parent 5add979

File tree

3 files changed (+32 −10):

- src/huggingface_hub/inference/_mcp/agent.py
- src/huggingface_hub/inference/_mcp/cli.py
- src/huggingface_hub/inference/_mcp/mcp_client.py

src/huggingface_hub/inference/_mcp/agent.py

Lines changed: 6 additions & 3 deletions

@@ -20,14 +20,16 @@ class Agent(MCPClient):
     </Tip>

     Args:
-        model (`str`):
+        model (`str`, *optional*):
             The model to run inference with. Can be a model id hosted on the Hugging Face Hub, e.g. `meta-llama/Meta-Llama-3-8B-Instruct`
             or a URL to a deployed Inference Endpoint or other local or remote endpoint.
         servers (`Iterable[Dict]`):
             MCP servers to connect to. Each server is a dictionary containing a `type` key and a `config` key. The `type` key can be `"stdio"` or `"sse"`, and the `config` key is a dictionary of arguments for the server.
         provider (`str`, *optional*):
             Name of the provider to use for inference. Defaults to "auto" i.e. the first of the providers available for the model, sorted by the user's order in https://hf.co/settings/inference-providers.
             If model is a URL or `base_url` is passed, then `provider` is not used.
+        base_url (`str`, *optional*):
+            The base URL to run inference. Defaults to None.
         api_key (`str`, *optional*):
             Token to use for authentication. Will default to the locally Hugging Face saved token if not provided. You can also use your own provider API key to interact directly with the provider's service.
         prompt (`str`, *optional*):

@@ -37,13 +39,14 @@ class Agent(MCPClient):
     def __init__(
         self,
         *,
-        model: str,
+        model: Optional[str] = None,
         servers: Iterable[Dict],
         provider: Optional[PROVIDER_OR_POLICY_T] = None,
+        base_url: Optional[str] = None,
         api_key: Optional[str] = None,
         prompt: Optional[str] = None,
     ):
-        super().__init__(model=model, provider=provider, api_key=api_key)
+        super().__init__(model=model, provider=provider, base_url=base_url, api_key=api_key)
        self._servers_cfg = list(servers)
        self.messages: List[Union[Dict, ChatCompletionInputMessage]] = [
            {"role": "system", "content": prompt or DEFAULT_SYSTEM_PROMPT}

src/huggingface_hub/inference/_mcp/cli.py

Lines changed: 11 additions & 3 deletions

@@ -1,6 +1,7 @@
 import asyncio
 import os
 import signal
+import traceback
 from functools import partial
 from typing import Any, Dict, List, Optional

@@ -71,8 +72,9 @@ def _sigint_handler() -> None:
         # Windows (or any loop that doesn't support it) : fall back to sync
         signal.signal(signal.SIGINT, lambda *_: _sigint_handler())
     async with Agent(
-        provider=config["provider"],
-        model=config["model"],
+        provider=config.get("provider"),
+        model=config.get("model"),
+        base_url=config.get("endpointUrl"),
         servers=servers,
         prompt=prompt,
     ) as agent:

@@ -123,9 +125,15 @@ def _sigint_handler() -> None:
                 print()

             except Exception as e:
-                print(f"\n[bold red]Error during agent run: {e}[/bold red]", flush=True)
+                tb_str = traceback.format_exc()
+                print(f"\n[bold red]Error during agent run: {e}\n{tb_str}[/bold red]", flush=True)
                 first_sigint = True  # Allow graceful interrupt for the next command

+    except Exception as e:
+        tb_str = traceback.format_exc()
+        print(f"\n[bold red]An unexpected error occurred: {e}\n{tb_str}[/bold red]", flush=True)
+        raise e
+
     finally:
         if sigint_registered_in_loop:
             try:
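The switch from bracket access to `config.get(...)` matters for endpoint-only configs: with only `endpointUrl` set, the `"model"` and `"provider"` keys may be absent entirely, and `config["model"]` would raise `KeyError` before `MCPClient` could emit its clearer ValueError. A small sketch with a hypothetical endpoint-only config (`endpointUrl` is the key this diff reads; the URL is a placeholder):

config = {
    "endpointUrl": "http://localhost:8080/v1",  # hypothetical local endpoint
}

assert config.get("model") is None     # tolerated: `base_url` alone is enough
assert config.get("provider") is None  # likewise optional now
# config["model"]  # would raise KeyError and crash the CLI before any validation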

src/huggingface_hub/inference/_mcp/mcp_client.py

Lines changed: 15 additions & 4 deletions

@@ -69,24 +69,34 @@ class MCPClient:
         provider (`str`, *optional*):
             Name of the provider to use for inference. Defaults to "auto" i.e. the first of the providers available for the model, sorted by the user's order in https://hf.co/settings/inference-providers.
             If model is a URL or `base_url` is passed, then `provider` is not used.
+        base_url (`str`, *optional*):
+            The base URL to run inference. Defaults to None.
         api_key (`str`, `optional`):
             Token to use for authentication. Will default to the locally Hugging Face saved token if not provided. You can also use your own provider API key to interact directly with the provider's service.
     """

     def __init__(
         self,
         *,
-        model: str,
+        model: Optional[str] = None,
         provider: Optional[PROVIDER_OR_POLICY_T] = None,
+        base_url: Optional[str] = None,
         api_key: Optional[str] = None,
     ):
         # Initialize MCP sessions as a dictionary of ClientSession objects
         self.sessions: Dict[ToolName, "ClientSession"] = {}
         self.exit_stack = AsyncExitStack()
         self.available_tools: List[ChatCompletionInputTool] = []
-
-        # Initialize the AsyncInferenceClient
-        self.client = AsyncInferenceClient(model=model, provider=provider, api_key=api_key)
+        # To be able to send the model in the payload if `base_url` is provided
+        if model is None and base_url is None:
+            raise ValueError("At least one of `model` or `base_url` should be set in `MCPClient`.")
+        self.payload_model = model
+        self.client = AsyncInferenceClient(
+            model=None if base_url is not None else model,
+            provider=provider,
+            api_key=api_key,
+            base_url=base_url,
+        )

     async def __aenter__(self):
         """Enter the context manager"""

@@ -244,6 +254,7 @@ async def process_single_turn_with_tools(

         # Create the streaming request
         response = await self.client.chat.completions.create(
+            model=self.payload_model,
             messages=messages,
             tools=tools,
             tool_choice="auto",
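The split between `payload_model` and the `model` argument of `AsyncInferenceClient` is the heart of the change: when `base_url` is set, the client routes requests to that URL, while the model id still travels in each chat-completion payload via `model=self.payload_model` (second hunk), which many OpenAI-compatible servers expect. A sketch of the resulting behavior, assuming the `mcp` extra is installed; the endpoint URLs below are placeholders:

from huggingface_hub.inference._mcp.mcp_client import MCPClient  # module path from this diff

# Neither `model` nor `base_url`: fails fast with the new ValueError.
try:
    MCPClient()
except ValueError as err:
    print(err)  # At least one of `model` or `base_url` should be set in `MCPClient`.

# `base_url` alone is now valid: the inner AsyncInferenceClient gets model=None,
# and `payload_model` stays None, so the endpoint's default model is used.
local_client = MCPClient(base_url="http://localhost:8080/v1")  # hypothetical endpoint

# With both set, requests still go to `base_url`, but the model id is sent
# in every request payload through `payload_model`.
named_client = MCPClient(
    model="meta-llama/Meta-Llama-3-8B-Instruct",
    base_url="http://localhost:8080/v1",  # hypothetical endpoint
)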
