import gradio as gr
import os
import requests
import json
from typing import List

# Inference Providers configuration
PROVIDERS = {
    "cerebras": {
        "name": "Cerebras",
        "tasks": ["chat-completion"],
        "base_url": "https://router.huggingface.co/cerebras",
    },
    "cohere": {
        "name": "Cohere",
        "tasks": ["chat-completion", "chat-completion-vlm"],
        "base_url": "https://router.huggingface.co/cohere",
    },
    "fal-ai": {
        "name": "Fal AI",
        "tasks": ["chat-completion", "chat-completion-vlm"],
        "base_url": "https://router.huggingface.co/fal-ai",
    },
    "featherless-ai": {
        "name": "Featherless AI",
        "tasks": ["chat-completion", "chat-completion-vlm"],
        "base_url": "https://router.huggingface.co/featherless-ai",
    },
    "fireworks-ai": {
        "name": "Fireworks",
        "tasks": ["chat-completion", "chat-completion-vlm"],
        "base_url": "https://router.huggingface.co/fireworks-ai",
    },
    "groq": {
        "name": "Groq",
        "tasks": ["chat-completion"],
        "base_url": "https://router.huggingface.co/groq",
    },
    "hf-inference": {
        "name": "HF Inference",
        "tasks": ["chat-completion", "chat-completion-vlm"],
        "base_url": "https://router.huggingface.co/hf-inference",
    },
    "hyperbolic": {
        "name": "Hyperbolic",
        "tasks": ["chat-completion", "chat-completion-vlm"],
        "base_url": "https://router.huggingface.co/hyperbolic",
    },
    "nebius": {
        "name": "Nebius",
        "tasks": ["chat-completion", "chat-completion-vlm"],
        "base_url": "https://router.huggingface.co/nebius",
    },
    "novita": {
        "name": "Novita",
        "tasks": ["chat-completion", "chat-completion-vlm"],
        "base_url": "https://router.huggingface.co/novita",
    },
    "nscale": {
        "name": "Nscale",
        "tasks": ["chat-completion", "chat-completion-vlm"],
        "base_url": "https://router.huggingface.co/nscale",
    },
    "replicate": {
        "name": "Replicate",
        "tasks": ["chat-completion", "chat-completion-vlm"],
        "base_url": "https://router.huggingface.co/replicate",
    },
    "sambanova": {
        "name": "SambaNova",
        "tasks": ["chat-completion", "chat-completion-vlm"],
        "base_url": "https://router.huggingface.co/sambanova",
    },
    "together": {
        "name": "Together",
        "tasks": ["chat-completion", "chat-completion-vlm"],
        "base_url": "https://router.huggingface.co/together",
    },
}
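
# Each provider above is reached through an OpenAI-compatible endpoint at
# {base_url}/v1/chat/completions behind the Hugging Face router; chat_completion
# below builds its request URL from this table.
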
def chat_completion(
    provider: str,
    model: str,
    messages: str,
    temperature: float = 0.7,
    max_tokens: int = 512,
):
    """Generate chat completions using Hugging Face Inference Providers.

    This tool provides access to multiple AI providers and language models
    through Hugging Face's unified Inference Providers API.

    Args:
        provider: The inference provider to use. Available providers:
            cerebras, cohere, fal-ai, featherless-ai, fireworks-ai,
            groq, hf-inference, hyperbolic, nebius, novita, nscale,
            replicate, sambanova, together
        model: The model ID from Hugging Face Hub
            (e.g., 'deepseek-ai/DeepSeek-V3-0324')
        messages: Either a JSON array of messages in OpenAI format or
            plain text for simple queries
        temperature: Controls response randomness (0.0-2.0, default 0.7)
        max_tokens: Maximum tokens in response (1-4096, default 512)

    Returns:
        The generated text response from the language model
    """
    # Get HF token from environment
    hf_token = os.getenv("HF_TOKEN")
    if not hf_token:
        return (
            "Error: HF_TOKEN environment variable is required. "
            "Please set your Hugging Face token."
        )

    # Validate provider
    if provider not in PROVIDERS:
        available = ", ".join(PROVIDERS.keys())
        return f"Error: Unknown provider '{provider}'. Available providers: {available}"

    try:
        # Parse messages: a JSON array (OpenAI format) or plain text
        if messages.strip().startswith("["):
            parsed_messages = json.loads(messages)
        else:
            parsed_messages = [{"role": "user", "content": messages}]

        # Build request payload
        payload = {
            "model": model,
            "messages": parsed_messages,
            "temperature": temperature,
            "max_tokens": max_tokens,
        }

        # Make request to the provider's OpenAI-compatible endpoint
        provider_config = PROVIDERS[provider]
        url = f"{provider_config['base_url']}/v1/chat/completions"
        headers = {
            "Authorization": f"Bearer {hf_token}",
            "Content-Type": "application/json",
        }

        response = requests.post(url, headers=headers, json=payload, timeout=60)
        response.raise_for_status()
        result = response.json()

        # Extract the generated text from the first choice
        if "choices" in result and len(result["choices"]) > 0:
            return result["choices"][0]["message"]["content"]
        else:
            return f"Error: Unexpected response format: {json.dumps(result, indent=2)}"

    except json.JSONDecodeError:
        return (
            "Error: Invalid JSON format for messages. "
            "Use either plain text or a valid JSON array."
        )
    except requests.exceptions.RequestException as e:
        return f"Error: Request failed: {str(e)}"
    except Exception as e:
        return f"Error: {str(e)}"


def get_providers_for_task(task: str) -> List[str]:
    """Get available providers for a specific task."""
    return [
        provider for provider, config in PROVIDERS.items() if task in config["tasks"]
    ]


# Create Gradio interface
with gr.Blocks(title="Inference Providers MCP Server", theme=gr.themes.Soft()) as app:
    gr.Markdown("""
    # πŸ€– Inference Providers MCP Server

    A streamlined Model Context Protocol (MCP) server for Hugging Face
    Inference Providers, giving LLMs access to multiple AI providers
    through a simple, focused interface.

    **Supported Providers:** Cerebras, Cohere, Fal AI, Featherless AI,
    Fireworks, Groq, HF Inference, Hyperbolic, Nebius, Novita, Nscale,
    Replicate, SambaNova, Together

    **Required:** Set the HF_TOKEN environment variable to a Hugging Face
    token that has Inference Providers access.
    """)

    # Environment status
    hf_token_status = "βœ… Set" if os.getenv("HF_TOKEN") else "❌ Not Set"
    gr.Markdown(f"**HF_TOKEN Status:** {hf_token_status}")

    if not os.getenv("HF_TOKEN"):
        gr.Markdown("""
        **⚠️ Setup Required:**

        1. Get a token: [HF Settings](https://huggingface.co/settings/tokens)
        2. Set the environment variable: `export HF_TOKEN=hf_your_token_here`
        3. Restart the application
        """)

    with gr.Tabs():
        # Chat Completion Tab
        with gr.Tab("πŸ’¬ Chat Completion", id="chat"):
            with gr.Row():
                with gr.Column(scale=1):
                    chat_provider = gr.Dropdown(
                        choices=get_providers_for_task("chat-completion"),
                        label="Provider",
                        value="novita",
                        info="Select inference provider",
                    )
                    chat_model = gr.Textbox(
                        label="Model",
                        value="deepseek-ai/DeepSeek-V3-0324",
                        placeholder="e.g., deepseek-ai/DeepSeek-V3-0324",
                        info="Model ID from Hugging Face Hub",
                    )
                with gr.Column(scale=2):
                    chat_messages = gr.Textbox(
                        label="Messages",
                        lines=8,
                        placeholder=(
                            '[{"role": "user", "content": "Hello!"}]'
                            "\n\nOr just type directly"
                        ),
                        info="JSON array of messages or plain text",
                    )

            with gr.Accordion("βš™οΈ Parameters", open=False):
                with gr.Row():
                    chat_temperature = gr.Slider(0.0, 2.0, 0.7, label="Temperature")
                    chat_max_tokens = gr.Slider(1, 4096, 512, label="Max Tokens")

            chat_submit = gr.Button("πŸš€ Generate", variant="primary")
            chat_output = gr.Textbox(label="Response", lines=10)

            chat_submit.click(
                chat_completion,
                inputs=[
                    chat_provider,
                    chat_model,
                    chat_messages,
                    chat_temperature,
                    chat_max_tokens,
                ],
                outputs=chat_output,
            )
        # MCP Documentation Tab
        with gr.Tab("πŸ”§ MCP Setup", id="mcp"):
            gr.Markdown("""
            ## πŸ€– MCP Server Setup

            This MCP server exposes a `chat_completion` tool that LLMs can call to
            access Hugging Face Inference Providers.

            ### πŸ“‘ Server URL

            **Local:** `http://localhost:7860/gradio_api/mcp/sse`

            **HF Spaces:** `https://username-spacename.hf.space/gradio_api/mcp/sse`

            ### βš™οΈ Client Configuration

            #### Cursor IDE

            Add to `.cursor/mcp.json`:

            ```json
            {
              "mcpServers": {
                "inference-providers": {
                  "url": "YOUR_URL/gradio_api/mcp/sse"
                }
              }
            }
            ```

            #### Claude Desktop

            Add to MCP settings:

            ```json
            {
              "mcpServers": {
                "inference-providers": {
                  "command": "npx",
                  "args": [
                    "mcp-remote",
                    "YOUR_URL/gradio_api/mcp/sse",
                    "--transport", "sse-only"
                  ]
                }
              }
            }
            ```

            ### πŸ› οΈ Tool Details

            **`chat_completion`** - Generate chat responses

            **Parameters:**

            - `provider`: Provider name (novita, groq, etc.)
            - `model`: Model ID (e.g., deepseek-ai/DeepSeek-V3-0324)
            - `messages`: Input text or JSON messages (see the example below)
            - `temperature`: Randomness (0.0-2.0, default: 0.7)
            - `max_tokens`: Max length (1-4096, default: 512)
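
            For example, `messages` can be plain text or an OpenAI-style JSON array
            of messages such as:

            ```json
            [
              {"role": "system", "content": "You are a helpful assistant."},
              {"role": "user", "content": "Explain Python decorators."}
            ]
            ```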

            **Environment:** Requires HF_TOKEN

            ### 🎯 Usage

            > "Use chat completion with Groq and Llama to explain Python"

            ### πŸ”— Links

            - [Cursor MCP](https://docs.cursor.com/context/model-context-protocol)
            - [Gradio MCP Guide](https://huggingface.co/blog/gradio-mcp)
            - [Get HF Token](https://huggingface.co/settings/tokens)
            """)


if __name__ == "__main__":
    # Enable MCP server functionality
    app.launch(mcp_server=True)