Spaces:

burtenshaw
/

inference-providers-mcp

Running

File size: 11,210 Bytes

551ae1a

import gradio as gr
import os
import requests
import json
from typing import List

# Inference Providers configuration
#
# Each row is (provider key, display name, supports vision-language chat).
# The provider key doubles as the path segment under the Hugging Face router,
# so the base URL is derived from it instead of being spelled out per entry.
_PROVIDER_TABLE = (
    ("cerebras", "Cerebras", False),
    ("cohere", "Cohere", True),
    ("fal-ai", "Fal AI", True),
    ("featherless-ai", "Featherless AI", True),
    ("fireworks-ai", "Fireworks", True),
    ("groq", "Groq", False),
    ("hf-inference", "HF Inference", True),
    ("hyperbolic", "Hyperbolic", True),
    ("nebius", "Nebius", True),
    ("novita", "Novita", True),
    ("nscale", "Nscale", True),
    ("replicate", "Replicate", True),
    ("sambanova", "SambaNova", True),
    ("together", "Together", True),
)

# Maps provider key -> {"name", "tasks", "base_url"}. The task list literal is
# evaluated once per iteration, so every provider owns an independent list.
PROVIDERS = {
    slug: {
        "name": label,
        "tasks": (
            ["chat-completion", "chat-completion-vlm"]
            if has_vlm
            else ["chat-completion"]
        ),
        "base_url": f"https://router.huggingface.co/{slug}",
    }
    for slug, label, has_vlm in _PROVIDER_TABLE
}


def _parse_messages(messages: str) -> list:
    """Turn the raw ``messages`` input into an OpenAI-style message list.

    Input whose first non-whitespace character is ``[`` is decoded as JSON;
    anything else is wrapped verbatim as a single user message.

    Raises:
        json.JSONDecodeError: If the input looks like a JSON array but does
            not parse.
    """
    if messages.strip().startswith("["):
        return json.loads(messages)
    return [{"role": "user", "content": messages}]


def chat_completion(
    provider: str,
    model: str,
    messages: str,
    temperature: float = 0.7,
    max_tokens: int = 512,
) -> str:
    """Generate chat completions using Hugging Face Inference Providers.

    This tool provides access to multiple AI providers and language models
    through Hugging Face's unified Inference Providers API.

    Failures are never raised to the caller: every problem (bad input,
    missing token, unknown provider, network/HTTP failure, unexpected
    response shape) is reported as a string starting with ``"Error:"``.

    Args:
        provider: The inference provider to use. Available providers:
                 cerebras, cohere, fal-ai, featherless-ai, fireworks-ai,
                 groq, hf-inference, hyperbolic, nebius, novita, nscale,
                 replicate, sambanova, together
        model: The model ID from Hugging Face Hub
               (e.g., 'deepseek-ai/DeepSeek-V3-0324')
        messages: Either a JSON array of messages in OpenAI format or
                 plain text for simple queries
        temperature: Controls response randomness (0.0-2.0, default 0.7)
        max_tokens: Maximum tokens in response (1-4096, default 512)

    Returns:
        The generated text response from the language model, or an
        ``"Error: ..."`` string describing what went wrong.
    """
    # Reject missing/blank input up front instead of sending an empty user
    # turn to the provider (or failing on ``None.strip()``).
    if not isinstance(messages, str) or not messages.strip():
        return (
            "Error: messages must be a non-empty string "
            "(plain text or a JSON array of messages)."
        )

    # Get HF token from environment
    hf_token = os.getenv("HF_TOKEN")
    if not hf_token:
        return (
            "Error: HF_TOKEN environment variable is required. "
            "Please set your Hugging Face token."
        )

    # Validate provider
    if provider not in PROVIDERS:
        available = ", ".join(PROVIDERS.keys())
        return f"Error: Unknown provider '{provider}'. Available providers: {available}"

    # Parse the user input in its own narrow ``try``: the JSON error message
    # below must only ever describe the *input*, never a malformed provider
    # response (requests' JSONDecodeError also subclasses json.JSONDecodeError).
    try:
        parsed_messages = _parse_messages(messages)
    except json.JSONDecodeError:
        return (
            "Error: Invalid JSON format for messages. "
            "Use either plain text or valid JSON array."
        )

    url = f"{PROVIDERS[provider]['base_url']}/v1/chat/completions"
    headers = {
        "Authorization": f"Bearer {hf_token}",
        "Content-Type": "application/json",
    }

    try:
        payload = {
            "model": model,
            "messages": parsed_messages,
            "temperature": temperature,
            # Numeric inputs may arrive as floats (e.g. 512.0); the token
            # budget is forwarded as an integer.
            "max_tokens": int(max_tokens),
        }
        response = requests.post(url, headers=headers, json=payload, timeout=60)
        response.raise_for_status()
        result = response.json()
    except requests.exceptions.HTTPError as e:
        # The response body usually carries the provider's explanation
        # (unsupported model, quota, ...); include a bounded excerpt.
        body = e.response.text.strip()[:500] if e.response is not None else ""
        detail = f" - {body}" if body else ""
        return f"Error: Request failed: {str(e)}{detail}"
    except requests.exceptions.RequestException as e:
        return f"Error: Request failed: {str(e)}"
    except Exception as e:
        # Last-resort boundary so the UI/MCP client always gets a string back.
        return f"Error: {str(e)}"

    # Extract response; any missing level in the expected structure is
    # reported as a format problem rather than a bare KeyError message.
    try:
        return result["choices"][0]["message"]["content"]
    except (KeyError, IndexError, TypeError):
        return f"Error: Unexpected response format: {json.dumps(result, indent=2)}"


def get_providers_for_task(task: str) -> List[str]:
    """Return the keys of all providers whose config lists ``task``.

    Keys come back in the order the providers are declared in ``PROVIDERS``;
    the list is empty when no provider advertises the task.
    """
    matching: List[str] = []
    for slug, settings in PROVIDERS.items():
        if task in settings["tasks"]:
            matching.append(slug)
    return matching


# Create Gradio interface
# Everything constructed inside this `with` block is attached to `app`, the
# object launched by the `__main__` guard at the bottom of the file.
with gr.Blocks(title="Inference Providers MCP Server", theme=gr.themes.Soft()) as app:
    # Static page header: what the server is, which providers are listed in
    # PROVIDERS, and the HF_TOKEN requirement.
    gr.Markdown("""
    # 🤖 Inference Providers MCP Server
    
    A streamlined Model Context Protocol (MCP) server for Hugging Face 
    Inference Providers, providing LLMs with access to multiple AI 
    providers through a simple, focused interface.
    
    **Supported Providers:** Cerebras, Cohere, Fal AI, Featherless AI, 
    Fireworks, Groq, HF Inference, Hyperbolic, Nebius, Novita, Nscale, 
    Replicate, SambaNova, Together
    
    **Required:** Set HF_TOKEN environment variable with your Hugging Face 
    token that has Inference Providers access.
    """)

    # Environment status
    # Evaluated once, while the interface is being built, so the badge shows
    # the token state at startup and is not refreshed afterwards.
    hf_token_status = "✅ Set" if os.getenv("HF_TOKEN") else "❌ Not Set"
    gr.Markdown(f"**HF_TOKEN Status:** {hf_token_status}")

    # Only show the setup checklist when the token is missing at startup.
    if not os.getenv("HF_TOKEN"):
        gr.Markdown("""
        **⚠️ Setup Required:**
        1. Get token: [HF Settings](https://huggingface.co/settings/tokens)
        2. Set environment: `export HF_TOKEN=hf_your_token_here`
        3. Restart application
        """)

    with gr.Tabs():
        # Chat Completion Tab
        with gr.Tab("💬 Chat Completion", id="chat"):
            with gr.Row():
                # Narrow left column: which provider and model to call.
                with gr.Column(scale=1):
                    # Choices are the PROVIDERS keys that list the
                    # "chat-completion" task.
                    chat_provider = gr.Dropdown(
                        choices=get_providers_for_task("chat-completion"),
                        label="Provider",
                        value="novita",
                        info="Select inference provider",
                    )
                    chat_model = gr.Textbox(
                        label="Model",
                        value="deepseek-ai/DeepSeek-V3-0324",
                        placeholder="e.g., deepseek-ai/DeepSeek-V3-0324",
                        info="Model ID from Hugging Face Hub",
                    )

                # Wider right column: the prompt itself. chat_completion
                # accepts either a JSON array or plain text here.
                with gr.Column(scale=2):
                    chat_messages = gr.Textbox(
                        label="Messages",
                        lines=8,
                        placeholder=(
                            '[{"role": "user", "content": "Hello!"}]'
                            "\n\nOr just type directly"
                        ),
                        info="JSON array of messages or plain text",
                    )

            # Sampling controls, collapsed by default. The ranges and initial
            # values match those documented on chat_completion
            # (0.0-2.0 / 0.7 and 1-4096 / 512).
            with gr.Accordion("⚙️ Parameters", open=False):
                with gr.Row():
                    chat_temperature = gr.Slider(0.0, 2.0, 0.7, label="Temperature")
                    chat_max_tokens = gr.Slider(1, 4096, 512, label="Max Tokens")

            chat_submit = gr.Button("🚀 Generate", variant="primary")
            chat_output = gr.Textbox(label="Response", lines=10)

            # Wire the button to chat_completion. The inputs are listed in the
            # same order as chat_completion's parameters; its returned string
            # (answer or "Error: ..." text) is shown in the Response box.
            chat_submit.click(
                chat_completion,
                inputs=[
                    chat_provider,
                    chat_model,
                    chat_messages,
                    chat_temperature,
                    chat_max_tokens,
                ],
                outputs=chat_output,
            )

        # MCP Documentation Tab
        # Static instructions only: server URLs, client configuration
        # snippets and a summary of the chat_completion tool parameters.
        with gr.Tab("🔧 MCP Setup", id="mcp"):
            gr.Markdown("""
            ## 🤖 MCP Server Setup
            
            This MCP server exposes `chat_completion` tool for LLMs to access
            Hugging Face Inference Providers.
            
            ### 📡 Server URL
            
            **Local:** `http://localhost:7860/gradio_api/mcp/sse`
            
            **HF Spaces:** `https://username-spacename.hf.space/gradio_api/mcp/sse`
            
            ### ⚙️ Client Configuration
            
            #### Cursor IDE
            
            Add to `.cursor/mcp.json`:
            ```json
            {
              "mcpServers": {
                "inference-providers": {
                  "url": "YOUR_URL/gradio_api/mcp/sse"
                }
              }
            }
            ```
            
            #### Claude Desktop
            
            Add to MCP settings:
            ```json
            {
              "mcpServers": {
                "inference-providers": {
                  "command": "npx",
                  "args": [
                    "mcp-remote", 
                    "YOUR_URL/gradio_api/mcp/sse",
                    "--transport", "sse-only"
                  ]
                }
              }
            }
            ```
            
            ### 🛠️ Tool Details
            
            **`chat_completion`** - Generate chat responses
            
            **Parameters:**
            - `provider`: Provider name (novita, groq, etc.)
            - `model`: Model ID (deepseek-ai/DeepSeek-V3-0324)
            - `messages`: Input text or JSON messages
            - `temperature`: Randomness (0.0-2.0, default: 0.7)
            - `max_tokens`: Max length (1-4096, default: 512)
            
            **Environment:** Requires HF_TOKEN
            
            ### 🎯 Usage
            
            > "Use chat completion with Groq and Llama to explain Python"
            
            ### 🔗 Links
            
            - [Cursor MCP](https://docs.cursor.com/context/model-context-protocol)
            - [Gradio MCP Guide](https://huggingface.co/blog/gradio-mcp)
            - [Get HF Token](https://huggingface.co/settings/tokens)
            """)


if __name__ == "__main__":
    # Enable MCP server functionality
    # Runs only when the file is executed directly (not on import): starts
    # the Gradio app with mcp_server=True so that, alongside the web UI, the
    # MCP endpoint described in the "MCP Setup" tab is served.
    app.launch(mcp_server=True)