import gradio as gr
import os
import requests
import json
from typing import List
|
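# Inference Providers reachable through the Hugging Face router. Each entry
# records a display name, the chat tasks it supports, and the base URL of its
# OpenAI-compatible endpoint.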
PROVIDERS = {
    "cerebras": {
        "name": "Cerebras",
        "tasks": ["chat-completion"],
        "base_url": "https://router.huggingface.co/cerebras",
    },
    "cohere": {
        "name": "Cohere",
        "tasks": ["chat-completion", "chat-completion-vlm"],
        "base_url": "https://router.huggingface.co/cohere",
    },
    "fal-ai": {
        "name": "Fal AI",
        "tasks": ["chat-completion", "chat-completion-vlm"],
        "base_url": "https://router.huggingface.co/fal-ai",
    },
    "featherless-ai": {
        "name": "Featherless AI",
        "tasks": ["chat-completion", "chat-completion-vlm"],
        "base_url": "https://router.huggingface.co/featherless-ai",
    },
    "fireworks-ai": {
        "name": "Fireworks",
        "tasks": ["chat-completion", "chat-completion-vlm"],
        "base_url": "https://router.huggingface.co/fireworks-ai",
    },
    "groq": {
        "name": "Groq",
        "tasks": ["chat-completion"],
        "base_url": "https://router.huggingface.co/groq",
    },
    "hf-inference": {
        "name": "HF Inference",
        "tasks": ["chat-completion", "chat-completion-vlm"],
        "base_url": "https://router.huggingface.co/hf-inference",
    },
    "hyperbolic": {
        "name": "Hyperbolic",
        "tasks": ["chat-completion", "chat-completion-vlm"],
        "base_url": "https://router.huggingface.co/hyperbolic",
    },
    "nebius": {
        "name": "Nebius",
        "tasks": ["chat-completion", "chat-completion-vlm"],
        "base_url": "https://router.huggingface.co/nebius",
    },
    "novita": {
        "name": "Novita",
        "tasks": ["chat-completion", "chat-completion-vlm"],
        "base_url": "https://router.huggingface.co/novita",
    },
    "nscale": {
        "name": "Nscale",
        "tasks": ["chat-completion", "chat-completion-vlm"],
        "base_url": "https://router.huggingface.co/nscale",
    },
    "replicate": {
        "name": "Replicate",
        "tasks": ["chat-completion", "chat-completion-vlm"],
        "base_url": "https://router.huggingface.co/replicate",
    },
    "sambanova": {
        "name": "SambaNova",
        "tasks": ["chat-completion", "chat-completion-vlm"],
        "base_url": "https://router.huggingface.co/sambanova",
    },
    "together": {
        "name": "Together",
        "tasks": ["chat-completion", "chat-completion-vlm"],
        "base_url": "https://router.huggingface.co/together",
    },
}
|
|
def chat_completion(
    provider: str,
    model: str,
    messages: str,
    temperature: float = 0.7,
    max_tokens: int = 512,
):
    """Generate chat completions using Hugging Face Inference Providers.

    This tool provides access to multiple AI providers and language models
    through Hugging Face's unified Inference Providers API.

    Args:
        provider: The inference provider to use. Available providers:
            cerebras, cohere, fal-ai, featherless-ai, fireworks-ai,
            groq, hf-inference, hyperbolic, nebius, novita, nscale,
            replicate, sambanova, together
        model: The model ID from Hugging Face Hub
            (e.g., 'deepseek-ai/DeepSeek-V3-0324')
        messages: Either a JSON array of messages in OpenAI format or
            plain text for simple queries
        temperature: Controls response randomness (0.0-2.0, default 0.7)
        max_tokens: Maximum tokens in response (1-4096, default 512)

    Returns:
        The generated text response from the language model
    """
    hf_token = os.getenv("HF_TOKEN")
    if not hf_token:
        return (
            "Error: HF_TOKEN environment variable is required. "
            "Please set your Hugging Face token."
        )

    if provider not in PROVIDERS:
        available = ", ".join(PROVIDERS.keys())
        return f"Error: Unknown provider '{provider}'. Available providers: {available}"

    try:
        # Accept either an OpenAI-style JSON array or a bare prompt string.
        if messages.strip().startswith("["):
            parsed_messages = json.loads(messages)
        else:
            parsed_messages = [{"role": "user", "content": messages}]

        payload = {
            "model": model,
            "messages": parsed_messages,
            "temperature": temperature,
            "max_tokens": max_tokens,
        }

        # Every provider exposes an OpenAI-compatible /v1/chat/completions
        # endpoint behind the Hugging Face router.
        provider_config = PROVIDERS[provider]
        url = f"{provider_config['base_url']}/v1/chat/completions"
        headers = {
            "Authorization": f"Bearer {hf_token}",
            "Content-Type": "application/json",
        }

        response = requests.post(url, headers=headers, json=payload, timeout=60)
        response.raise_for_status()
        result = response.json()

        if "choices" in result and len(result["choices"]) > 0:
            return result["choices"][0]["message"]["content"]
        else:
            return f"Error: Unexpected response format: {json.dumps(result, indent=2)}"

    except json.JSONDecodeError:
        return (
            "Error: Invalid JSON format for messages. "
            "Use either plain text or a valid JSON array."
        )
    except requests.exceptions.RequestException as e:
        return f"Error: Request failed: {str(e)}"
    except Exception as e:
        return f"Error: {str(e)}"
|
|
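# Illustrative direct call (assumes HF_TOKEN is set and that this
# provider/model pair is currently being served):
#
#   chat_completion("novita", "deepseek-ai/DeepSeek-V3-0324", "Hello!")
#   chat_completion(
#       "novita",
#       "deepseek-ai/DeepSeek-V3-0324",
#       '[{"role": "user", "content": "Explain MCP in two sentences."}]',
#       temperature=0.2,
#       max_tokens=256,
#   )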
|
|
def get_providers_for_task(task: str) -> List[str]:
    """Get available providers for a specific task."""
    return [
        provider for provider, config in PROVIDERS.items() if task in config["tasks"]
    ]
|
|
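# Illustrative: get_providers_for_task("chat-completion") returns every key in
# PROVIDERS, while get_providers_for_task("chat-completion-vlm") drops the two
# providers ("cerebras", "groq") that only declare plain chat completion.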
|
|
with gr.Blocks(title="Inference Providers MCP Server", theme=gr.themes.Soft()) as app:
    gr.Markdown("""
    # 🚀 Inference Providers MCP Server

    A streamlined Model Context Protocol (MCP) server for Hugging Face
    Inference Providers, giving LLMs access to multiple AI providers
    through a simple, focused interface.

    **Supported Providers:** Cerebras, Cohere, Fal AI, Featherless AI,
    Fireworks, Groq, HF Inference, Hyperbolic, Nebius, Novita, Nscale,
    Replicate, SambaNova, Together

    **Required:** Set the HF_TOKEN environment variable to a Hugging Face
    token that has Inference Providers access.
    """)
|
    hf_token_status = "✅ Set" if os.getenv("HF_TOKEN") else "❌ Not Set"
    gr.Markdown(f"**HF_TOKEN Status:** {hf_token_status}")

    if not os.getenv("HF_TOKEN"):
        gr.Markdown("""
        **⚠️ Setup Required:**

        1. Get a token: [HF Settings](https://huggingface.co/settings/tokens)
        2. Set the environment variable: `export HF_TOKEN=hf_your_token_here`
        3. Restart the application
        """)
|
    with gr.Tabs():
        with gr.Tab("💬 Chat Completion", id="chat"):
            with gr.Row():
                with gr.Column(scale=1):
                    chat_provider = gr.Dropdown(
                        choices=get_providers_for_task("chat-completion"),
                        label="Provider",
                        value="novita",
                        info="Select inference provider",
                    )
                    chat_model = gr.Textbox(
                        label="Model",
                        value="deepseek-ai/DeepSeek-V3-0324",
                        placeholder="e.g., deepseek-ai/DeepSeek-V3-0324",
                        info="Model ID from Hugging Face Hub",
                    )

                with gr.Column(scale=2):
                    chat_messages = gr.Textbox(
                        label="Messages",
                        lines=8,
                        placeholder=(
                            '[{"role": "user", "content": "Hello!"}]'
                            "\n\nOr just type directly"
                        ),
                        info="JSON array of messages or plain text",
                    )

            with gr.Accordion("⚙️ Parameters", open=False):
                with gr.Row():
                    chat_temperature = gr.Slider(0.0, 2.0, 0.7, label="Temperature")
                    chat_max_tokens = gr.Slider(1, 4096, 512, label="Max Tokens")

            chat_submit = gr.Button("🚀 Generate", variant="primary")
            chat_output = gr.Textbox(label="Response", lines=10)

            chat_submit.click(
                chat_completion,
                inputs=[
                    chat_provider,
                    chat_model,
                    chat_messages,
                    chat_temperature,
                    chat_max_tokens,
                ],
                outputs=chat_output,
            )
|
with gr.Tab("π§ MCP Setup", id="mcp"): |
|
gr.Markdown(""" |
|
## π€ MCP Server Setup |
|
|
|
This MCP server exposes `chat_completion` tool for LLMs to access |
|
Hugging Face Inference Providers. |
|
|
|
### π‘ Server URL |
|
|
|
**Local:** `http://localhost:7860/gradio_api/mcp/sse` |
|
|
|
**HF Spaces:** `https://username-spacename.hf.space/gradio_api/mcp/sse` |
|
|
|
### βοΈ Client Configuration |
|
|
|
#### Cursor IDE |
|
|
|
Add to `.cursor/mcp.json`: |
|
```json |
|
{ |
|
"mcpServers": { |
|
"inference-providers": { |
|
"url": "YOUR_URL/gradio_api/mcp/sse" |
|
} |
|
} |
|
} |
|
``` |
|
|
|
#### Claude Desktop |
|
|
|
Add to MCP settings: |
|
```json |
|
{ |
|
"mcpServers": { |
|
"inference-providers": { |
|
"command": "npx", |
|
"args": [ |
|
"mcp-remote", |
|
"YOUR_URL/gradio_api/mcp/sse", |
|
"--transport", "sse-only" |
|
] |
|
} |
|
} |
|
} |
|
``` |
|
|
|
### π οΈ Tool Details |
|
|
|
**`chat_completion`** - Generate chat responses |
|
|
|
**Parameters:** |
|
- `provider`: Provider name (novita, groq, etc.) |
|
- `model`: Model ID (deepseek-ai/DeepSeek-V3-0324) |
|
- `messages`: Input text or JSON messages |
|
- `temperature`: Randomness (0.0-2.0, default: 0.7) |
|
- `max_tokens`: Max length (1-4096, default: 512) |
|
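            A sample tool call as a client might send it (illustrative; the
            argument values are only examples):
            ```json
            {
              "name": "chat_completion",
              "arguments": {
                "provider": "novita",
                "model": "deepseek-ai/DeepSeek-V3-0324",
                "messages": "Explain Python decorators in one paragraph",
                "temperature": 0.7,
                "max_tokens": 512
              }
            }
            ```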
|
|
            **Environment:** Requires HF_TOKEN

            ### 🎯 Usage

            > "Use chat completion with Groq and Llama to explain Python"

            ### 🔗 Links

            - [Cursor MCP](https://docs.cursor.com/context/model-context-protocol)
            - [Gradio MCP Guide](https://huggingface.co/blog/gradio-mcp)
            - [Get HF Token](https://huggingface.co/settings/tokens)
            """)
|
|
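# Note: mcp_server=True requires Gradio's MCP extra (pip install "gradio[mcp]").
# Gradio builds the MCP tool schema from chat_completion's type hints and
# docstring and serves it at the /gradio_api/mcp/sse endpoint referenced above.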
if __name__ == "__main__":
    app.launch(mcp_server=True)