import gradio as gr
import os
import requests
import json
from typing import List

# Inference Providers configuration
PROVIDERS = {
    "cerebras": {
        "name": "Cerebras",
        "tasks": ["chat-completion"],
        "base_url": "https://router.huggingface.co/cerebras",
    },
    "cohere": {
        "name": "Cohere",
        "tasks": ["chat-completion", "chat-completion-vlm"],
        "base_url": "https://router.huggingface.co/cohere",
    },
    "fal-ai": {
        "name": "Fal AI",
        "tasks": ["chat-completion", "chat-completion-vlm"],
        "base_url": "https://router.huggingface.co/fal-ai",
    },
    "featherless-ai": {
        "name": "Featherless AI",
        "tasks": ["chat-completion", "chat-completion-vlm"],
        "base_url": "https://router.huggingface.co/featherless-ai",
    },
    "fireworks-ai": {
        "name": "Fireworks",
        "tasks": ["chat-completion", "chat-completion-vlm"],
        "base_url": "https://router.huggingface.co/fireworks-ai",
    },
    "groq": {
        "name": "Groq",
        "tasks": ["chat-completion"],
        "base_url": "https://router.huggingface.co/groq",
    },
    "hf-inference": {
        "name": "HF Inference",
        "tasks": ["chat-completion", "chat-completion-vlm"],
        "base_url": "https://router.huggingface.co/hf-inference",
    },
    "hyperbolic": {
        "name": "Hyperbolic",
        "tasks": ["chat-completion", "chat-completion-vlm"],
        "base_url": "https://router.huggingface.co/hyperbolic",
    },
    "nebius": {
        "name": "Nebius",
        "tasks": ["chat-completion", "chat-completion-vlm"],
        "base_url": "https://router.huggingface.co/nebius",
    },
    "novita": {
        "name": "Novita",
        "tasks": ["chat-completion", "chat-completion-vlm"],
        "base_url": "https://router.huggingface.co/novita",
    },
    "nscale": {
        "name": "Nscale",
        "tasks": ["chat-completion", "chat-completion-vlm"],
        "base_url": "https://router.huggingface.co/nscale",
    },
    "replicate": {
        "name": "Replicate",
        "tasks": ["chat-completion", "chat-completion-vlm"],
        "base_url": "https://router.huggingface.co/replicate",
    },
    "sambanova": {
        "name": "SambaNova",
        "tasks": ["chat-completion", "chat-completion-vlm"],
        "base_url": "https://router.huggingface.co/sambanova",
    },
    "together": {
        "name": "Together",
        "tasks": ["chat-completion", "chat-completion-vlm"],
        "base_url": "https://router.huggingface.co/together",
    },
}
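
# Each provider above is reached through an OpenAI-compatible endpoint at
# {base_url}/v1/chat/completions behind the Hugging Face router; chat_completion
# below builds its request URL from this table.
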
def chat_completion(
    provider: str,
    model: str,
    messages: str,
    temperature: float = 0.7,
    max_tokens: int = 512,
):
    """Generate chat completions using Hugging Face Inference Providers.

    This tool provides access to multiple AI providers and language models
    through Hugging Face's unified Inference Providers API.

    Args:
        provider: The inference provider to use. Available providers:
            cerebras, cohere, fal-ai, featherless-ai, fireworks-ai,
            groq, hf-inference, hyperbolic, nebius, novita, nscale,
            replicate, sambanova, together
        model: The model ID from Hugging Face Hub
            (e.g., 'deepseek-ai/DeepSeek-V3-0324')
        messages: Either a JSON array of messages in OpenAI format or
            plain text for simple queries
        temperature: Controls response randomness (0.0-2.0, default 0.7)
        max_tokens: Maximum tokens in response (1-4096, default 512)

    Returns:
        The generated text response from the language model
    """
    # Get HF token from environment
    hf_token = os.getenv("HF_TOKEN")
    if not hf_token:
        return (
            "Error: HF_TOKEN environment variable is required. "
            "Please set your Hugging Face token."
        )

    # Validate provider
    if provider not in PROVIDERS:
        available = ", ".join(PROVIDERS.keys())
        return f"Error: Unknown provider '{provider}'. Available providers: {available}"

    try:
        # Parse messages: a JSON array (OpenAI format) or plain text
        if messages.strip().startswith("["):
            parsed_messages = json.loads(messages)
        else:
            parsed_messages = [{"role": "user", "content": messages}]

        # Build request payload
        payload = {
            "model": model,
            "messages": parsed_messages,
            "temperature": temperature,
            "max_tokens": max_tokens,
        }

        # Make request to the provider's OpenAI-compatible endpoint
        provider_config = PROVIDERS[provider]
        url = f"{provider_config['base_url']}/v1/chat/completions"
        headers = {
            "Authorization": f"Bearer {hf_token}",
            "Content-Type": "application/json",
        }

        response = requests.post(url, headers=headers, json=payload, timeout=60)
        response.raise_for_status()
        result = response.json()

        # Extract the generated text from the first choice
        if "choices" in result and len(result["choices"]) > 0:
            return result["choices"][0]["message"]["content"]
        else:
            return f"Error: Unexpected response format: {json.dumps(result, indent=2)}"

    except json.JSONDecodeError:
        return (
            "Error: Invalid JSON format for messages. "
            "Use either plain text or a valid JSON array."
        )
    except requests.exceptions.RequestException as e:
        return f"Error: Request failed: {str(e)}"
    except Exception as e:
        return f"Error: {str(e)}"


def get_providers_for_task(task: str) -> List[str]:
    """Get available providers for a specific task."""
    return [
        provider for provider, config in PROVIDERS.items() if task in config["tasks"]
    ]


# Create Gradio interface
with gr.Blocks(title="Inference Providers MCP Server", theme=gr.themes.Soft()) as app:
    gr.Markdown("""
    # πŸ€– Inference Providers MCP Server

    A streamlined Model Context Protocol (MCP) server for Hugging Face
    Inference Providers, giving LLMs access to multiple AI providers
    through a simple, focused interface.

    **Supported Providers:** Cerebras, Cohere, Fal AI, Featherless AI,
    Fireworks, Groq, HF Inference, Hyperbolic, Nebius, Novita, Nscale,
    Replicate, SambaNova, Together

    **Required:** Set the HF_TOKEN environment variable to a Hugging Face
    token that has Inference Providers access.
    """)

    # Environment status
    hf_token_status = "βœ… Set" if os.getenv("HF_TOKEN") else "❌ Not Set"
    gr.Markdown(f"**HF_TOKEN Status:** {hf_token_status}")

    if not os.getenv("HF_TOKEN"):
        gr.Markdown("""
        **⚠️ Setup Required:**

        1. Get a token: [HF Settings](https://huggingface.co/settings/tokens)
        2. Set the environment variable: `export HF_TOKEN=hf_your_token_here`
        3. Restart the application
        """)

    with gr.Tabs():
        # Chat Completion Tab
        with gr.Tab("πŸ’¬ Chat Completion", id="chat"):
            with gr.Row():
                with gr.Column(scale=1):
                    chat_provider = gr.Dropdown(
                        choices=get_providers_for_task("chat-completion"),
                        label="Provider",
                        value="novita",
                        info="Select inference provider",
                    )
                    chat_model = gr.Textbox(
                        label="Model",
                        value="deepseek-ai/DeepSeek-V3-0324",
                        placeholder="e.g., deepseek-ai/DeepSeek-V3-0324",
                        info="Model ID from Hugging Face Hub",
                    )
                with gr.Column(scale=2):
                    chat_messages = gr.Textbox(
                        label="Messages",
                        lines=8,
                        placeholder=(
                            '[{"role": "user", "content": "Hello!"}]'
                            "\n\nOr just type directly"
                        ),
                        info="JSON array of messages or plain text",
                    )

            with gr.Accordion("βš™οΈ Parameters", open=False):
                with gr.Row():
                    chat_temperature = gr.Slider(0.0, 2.0, 0.7, label="Temperature")
                    chat_max_tokens = gr.Slider(1, 4096, 512, label="Max Tokens")

            chat_submit = gr.Button("πŸš€ Generate", variant="primary")
            chat_output = gr.Textbox(label="Response", lines=10)

            chat_submit.click(
                chat_completion,
                inputs=[
                    chat_provider,
                    chat_model,
                    chat_messages,
                    chat_temperature,
                    chat_max_tokens,
                ],
                outputs=chat_output,
            )
        # MCP Documentation Tab
        with gr.Tab("πŸ”§ MCP Setup", id="mcp"):
            gr.Markdown("""
            ## πŸ€– MCP Server Setup

            This MCP server exposes a `chat_completion` tool that LLMs can call to
            access Hugging Face Inference Providers.

            ### πŸ“‘ Server URL

            **Local:** `http://localhost:7860/gradio_api/mcp/sse`

            **HF Spaces:** `https://username-spacename.hf.space/gradio_api/mcp/sse`

            ### βš™οΈ Client Configuration

            #### Cursor IDE

            Add to `.cursor/mcp.json`:

            ```json
            {
              "mcpServers": {
                "inference-providers": {
                  "url": "YOUR_URL/gradio_api/mcp/sse"
                }
              }
            }
            ```

            #### Claude Desktop

            Add to MCP settings:

            ```json
            {
              "mcpServers": {
                "inference-providers": {
                  "command": "npx",
                  "args": [
                    "mcp-remote",
                    "YOUR_URL/gradio_api/mcp/sse",
                    "--transport", "sse-only"
                  ]
                }
              }
            }
            ```

            ### πŸ› οΈ Tool Details

            **`chat_completion`** - Generate chat responses

            **Parameters:**

            - `provider`: Provider name (novita, groq, etc.)
            - `model`: Model ID (e.g., deepseek-ai/DeepSeek-V3-0324)
            - `messages`: Input text or JSON messages (see the example below)
            - `temperature`: Randomness (0.0-2.0, default: 0.7)
            - `max_tokens`: Max length (1-4096, default: 512)
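
            For example, `messages` can be plain text or an OpenAI-style JSON array
            of messages such as:

            ```json
            [
              {"role": "system", "content": "You are a helpful assistant."},
              {"role": "user", "content": "Explain Python decorators."}
            ]
            ```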

            **Environment:** Requires HF_TOKEN

            ### 🎯 Usage

            > "Use chat completion with Groq and Llama to explain Python"

            ### πŸ”— Links

            - [Cursor MCP](https://docs.cursor.com/context/model-context-protocol)
            - [Gradio MCP Guide](https://huggingface.co/blog/gradio-mcp)
            - [Get HF Token](https://huggingface.co/settings/tokens)
            """)


if __name__ == "__main__":
    # Enable MCP server functionality
    app.launch(mcp_server=True)