import gradio as gr
import os
import requests
import json
from typing import List

# Inference Providers configuration
PROVIDERS = {
    "cerebras": {
        "name": "Cerebras",
        "tasks": ["chat-completion"],
        "base_url": "https://router.huggingface.co/cerebras",
    },
    "cohere": {
        "name": "Cohere",
        "tasks": ["chat-completion", "chat-completion-vlm"],
        "base_url": "https://router.huggingface.co/cohere",
    },
    "fal-ai": {
        "name": "Fal AI",
        "tasks": ["chat-completion", "chat-completion-vlm"],
        "base_url": "https://router.huggingface.co/fal-ai",
    },
    "featherless-ai": {
        "name": "Featherless AI",
        "tasks": ["chat-completion", "chat-completion-vlm"],
        "base_url": "https://router.huggingface.co/featherless-ai",
    },
    "fireworks-ai": {
        "name": "Fireworks",
        "tasks": ["chat-completion", "chat-completion-vlm"],
        "base_url": "https://router.huggingface.co/fireworks-ai",
    },
    "groq": {
        "name": "Groq",
        "tasks": ["chat-completion"],
        "base_url": "https://router.huggingface.co/groq",
    },
    "hf-inference": {
        "name": "HF Inference",
        "tasks": ["chat-completion", "chat-completion-vlm"],
        "base_url": "https://router.huggingface.co/hf-inference",
    },
    "hyperbolic": {
        "name": "Hyperbolic",
        "tasks": ["chat-completion", "chat-completion-vlm"],
        "base_url": "https://router.huggingface.co/hyperbolic",
    },
    "nebius": {
        "name": "Nebius",
        "tasks": ["chat-completion", "chat-completion-vlm"],
        "base_url": "https://router.huggingface.co/nebius",
    },
    "novita": {
        "name": "Novita",
        "tasks": ["chat-completion", "chat-completion-vlm"],
        "base_url": "https://router.huggingface.co/novita",
    },
    "nscale": {
        "name": "Nscale",
        "tasks": ["chat-completion", "chat-completion-vlm"],
        "base_url": "https://router.huggingface.co/nscale",
    },
    "replicate": {
        "name": "Replicate",
        "tasks": ["chat-completion", "chat-completion-vlm"],
        "base_url": "https://router.huggingface.co/replicate",
    },
    "sambanova": {
        "name": "SambaNova",
        "tasks": ["chat-completion", "chat-completion-vlm"],
        "base_url": "https://router.huggingface.co/sambanova",
    },
    "together": {
        "name": "Together",
        "tasks": ["chat-completion", "chat-completion-vlm"],
        "base_url": "https://router.huggingface.co/together",
    },
}


def chat_completion(
    provider: str,
    model: str,
    messages: str,
    temperature: float = 0.7,
    max_tokens: int = 512,
):
    """Generate chat completions using Hugging Face Inference Providers.

    This tool provides access to multiple AI providers and language models
    through Hugging Face's unified Inference Providers API.

    Args:
        provider: The inference provider to use. Available providers: cerebras,
            cohere, fal-ai, featherless-ai, fireworks-ai, groq, hf-inference,
            hyperbolic, nebius, novita, nscale, replicate, sambanova, together
        model: The model ID from Hugging Face Hub
            (e.g., 'deepseek-ai/DeepSeek-V3-0324')
        messages: Either a JSON array of messages in OpenAI format or plain
            text for simple queries
        temperature: Controls response randomness (0.0-2.0, default 0.7)
        max_tokens: Maximum tokens in response (1-4096, default 512)

    Returns:
        The generated text response from the language model
    """
    # Get HF token from environment
    hf_token = os.getenv("HF_TOKEN")
    if not hf_token:
        return (
            "Error: HF_TOKEN environment variable is required. "
            "Please set your Hugging Face token."
        )

    # Validate provider
    if provider not in PROVIDERS:
        available = ", ".join(PROVIDERS.keys())
        return (
            f"Error: Unknown provider '{provider}'. "
            f"Available providers: {available}"
        )
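
    # Note: `messages` may arrive either as a JSON array in OpenAI chat format
    # or as plain text; plain text is wrapped as a single user message below.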
    try:
        # Parse messages
        if messages.strip().startswith("["):
            parsed_messages = json.loads(messages)
        else:
            parsed_messages = [{"role": "user", "content": messages}]

        # Build request payload
        payload = {
            "model": model,
            "messages": parsed_messages,
            "temperature": temperature,
            "max_tokens": max_tokens,
        }

        # Make request to provider
        provider_config = PROVIDERS[provider]
        url = f"{provider_config['base_url']}/v1/chat/completions"
        headers = {
            "Authorization": f"Bearer {hf_token}",
            "Content-Type": "application/json",
        }

        response = requests.post(url, headers=headers, json=payload, timeout=60)
        response.raise_for_status()

        result = response.json()

        # Extract response
        if "choices" in result and len(result["choices"]) > 0:
            return result["choices"][0]["message"]["content"]
        else:
            return f"Error: Unexpected response format: {json.dumps(result, indent=2)}"

    except json.JSONDecodeError:
        return (
            "Error: Invalid JSON format for messages. "
            "Use either plain text or valid JSON array."
        )
    except requests.exceptions.RequestException as e:
        return f"Error: Request failed: {str(e)}"
    except Exception as e:
        return f"Error: {str(e)}"


def get_providers_for_task(task: str) -> List[str]:
    """Get available providers for a specific task"""
    return [
        provider for provider, config in PROVIDERS.items() if task in config["tasks"]
    ]


# Create Gradio interface
with gr.Blocks(title="Inference Providers MCP Server", theme=gr.themes.Soft()) as app:
    gr.Markdown("""
# 🤖 Inference Providers MCP Server

A streamlined Model Context Protocol (MCP) server for Hugging Face Inference Providers,
providing LLMs with access to multiple AI providers through a simple, focused interface.

**Supported Providers:** Cerebras, Cohere, Fal AI, Featherless AI, Fireworks, Groq,
HF Inference, Hyperbolic, Nebius, Novita, Nscale, Replicate, SambaNova, Together

**Required:** Set the HF_TOKEN environment variable to a Hugging Face token with
Inference Providers access.
    """)

    # Environment status
    hf_token_status = "✅ Set" if os.getenv("HF_TOKEN") else "❌ Not Set"
    gr.Markdown(f"**HF_TOKEN Status:** {hf_token_status}")

    if not os.getenv("HF_TOKEN"):
        gr.Markdown("""
**⚠️ Setup Required:**
1. Get token: [HF Settings](https://huggingface.co/settings/tokens)
2. Set environment: `export HF_TOKEN=hf_your_token_here`
3. Restart application
        """)

    with gr.Tabs():
        # Chat Completion Tab
        with gr.Tab("💬 Chat Completion", id="chat"):
            with gr.Row():
                with gr.Column(scale=1):
                    chat_provider = gr.Dropdown(
                        choices=get_providers_for_task("chat-completion"),
                        label="Provider",
                        value="novita",
                        info="Select inference provider",
                    )
                    chat_model = gr.Textbox(
                        label="Model",
                        value="deepseek-ai/DeepSeek-V3-0324",
                        placeholder="e.g., deepseek-ai/DeepSeek-V3-0324",
                        info="Model ID from Hugging Face Hub",
                    )
                with gr.Column(scale=2):
                    chat_messages = gr.Textbox(
                        label="Messages",
                        lines=8,
                        placeholder=(
                            '[{"role": "user", "content": "Hello!"}]'
                            "\n\nOr just type directly"
                        ),
                        info="JSON array of messages or plain text",
                    )

            with gr.Accordion("⚙️ Parameters", open=False):
                with gr.Row():
                    chat_temperature = gr.Slider(0.0, 2.0, 0.7, label="Temperature")
                    chat_max_tokens = gr.Slider(1, 4096, 512, label="Max Tokens")

            chat_submit = gr.Button("🚀 Generate", variant="primary")
            chat_output = gr.Textbox(label="Response", lines=10)

            chat_submit.click(
                chat_completion,
                inputs=[
                    chat_provider,
                    chat_model,
                    chat_messages,
                    chat_temperature,
                    chat_max_tokens,
                ],
                outputs=chat_output,
            )

        # MCP Documentation Tab
        with gr.Tab("🔧 MCP Setup", id="mcp"):
            gr.Markdown("""
## 🤖 MCP Server Setup

This MCP server exposes the `chat_completion` tool for LLMs to access
Hugging Face Inference Providers.

### 📡 Server URL
**Local:** `http://localhost:7860/gradio_api/mcp/sse`
**HF Spaces:** `https://username-spacename.hf.space/gradio_api/mcp/sse`

### ⚙️ Client Configuration

#### Cursor IDE
Add to `.cursor/mcp.json`:
```json
{
  "mcpServers": {
    "inference-providers": {
      "url": "YOUR_URL/gradio_api/mcp/sse"
    }
  }
}
```

#### Claude Desktop
Add to MCP settings:
```json
{
  "mcpServers": {
    "inference-providers": {
      "command": "npx",
      "args": [
        "mcp-remote",
        "YOUR_URL/gradio_api/mcp/sse",
        "--transport",
        "sse-only"
      ]
    }
  }
}
```

### 🛠️ Tool Details

**`chat_completion`** - Generate chat responses

**Parameters:**
- `provider`: Provider name (novita, groq, etc.)
- `model`: Model ID (deepseek-ai/DeepSeek-V3-0324)
- `messages`: Input text or JSON messages
- `temperature`: Randomness (0.0-2.0, default: 0.7)
- `max_tokens`: Max length (1-4096, default: 512)

**Environment:** Requires HF_TOKEN

### 🎯 Usage
> "Use chat completion with Groq and Llama to explain Python"

### 🔗 Links
- [Cursor MCP](https://docs.cursor.com/context/model-context-protocol)
- [Gradio MCP Guide](https://huggingface.co/blog/gradio-mcp)
- [Get HF Token](https://huggingface.co/settings/tokens)
            """)

if __name__ == "__main__":
    # Enable MCP server functionality
    app.launch(mcp_server=True)
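
# A minimal local sketch (not part of the MCP wiring above): assuming HF_TOKEN is
# set and the chosen provider serves the model, the tool function can be exercised
# directly from a Python shell before connecting an MCP client, e.g.:
#
#   from app import chat_completion  # assumes this file is saved as app.py
#   print(chat_completion("novita", "deepseek-ai/DeepSeek-V3-0324", "Hello!"))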