# app/main.py
import os, asyncio, logging
import gradio as gr
from huggingface_hub import InferenceClient
from .prompt import build_prompt
# ---------------------------------------------------------------------
# model / client initialisation
# ---------------------------------------------------------------------
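# Configuration via environment variables (defaults shown below):
#   HF_TOKEN        - required Hugging Face access token
#   MODEL_ID        - model to query (optional)
#   MAX_NEW_TOKENS  - generation length cap (optional)
#   TEMPERATURE     - sampling temperature (optional)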
HF_TOKEN = os.getenv("HF_TOKEN")
MODEL_ID = os.getenv("MODEL_ID", "meta-llama/Meta-Llama-3-8B-Instruct")
MAX_TOKENS = int(os.getenv("MAX_NEW_TOKENS", "512"))
TEMPERATURE = float(os.getenv("TEMPERATURE", "0.2"))
if not HF_TOKEN:
    raise RuntimeError("HF_TOKEN env-var missing.")
client = InferenceClient(model=MODEL_ID, token=HF_TOKEN)
# ---------------------------------------------------------------------
# Core generation function for both Gradio UI and MCP
# ---------------------------------------------------------------------
async def _call_llm(prompt: str) -> str:
    """
    Try text_generation first (for models/providers that still support it);
    fall back to chat_completion when the provider is chat-only (Novita, etc.).
    """
    try:
        # hf-inference text-generation endpoint
        return await asyncio.to_thread(
            client.text_generation,
            prompt,
            max_new_tokens=MAX_TOKENS,
            temperature=TEMPERATURE,
        )
    except ValueError as e:
        if "Supported task: conversational" not in str(e):
            raise  # genuine error → bubble up
        # fallback for chat-only providers such as Novita
        messages = [{"role": "user", "content": prompt}]
        completion = await asyncio.to_thread(
            client.chat_completion,
            messages=messages,
            model=MODEL_ID,
            max_tokens=MAX_TOKENS,
            temperature=TEMPERATURE,
        )
        return completion.choices[0].message.content.strip()

async def rag_generate(query: str, context: str) -> str:
    """
    Generate an answer to a query using provided context through RAG.

    This function takes a user query and relevant context, then uses a language model
    to generate a comprehensive answer based on the provided information.

    Args:
        query (str): The user's question or query
        context (str): The relevant context/documents to use for answering

    Returns:
        str: The generated answer based on the query and context
    """
    if not query.strip():
        return "Error: Query cannot be empty"
    if not context.strip():
        return "Error: Context cannot be empty"

    prompt = build_prompt(query, context)
    try:
        answer = await _call_llm(prompt)
        return answer
    except Exception as e:
        logging.exception("Generation failed")
        return f"Error: {str(e)}"
# ---------------------------------------------------------------------
# Gradio Interface with MCP support
# ---------------------------------------------------------------------
ui = gr.Interface(
    fn=rag_generate,
    inputs=[
        gr.Textbox(
            label="Query",
            lines=2,
            placeholder="What would you like to know?",
            info="Enter your question here",
        ),
        gr.Textbox(
            label="Context",
            lines=8,
            placeholder="Paste relevant documents or context here...",
            info="Provide the context/documents to use for answering",
        ),
    ],
    outputs=gr.Textbox(
        label="Generated Answer",
        lines=6,
        show_copy_button=True,
    ),
    title="RAG Generation Service",
    description="Ask questions and get answers based on your provided context. This service is also available as an MCP server for integration with AI applications.",
    examples=[
        [
            "What is the main benefit mentioned?",
            "Machine learning has revolutionized many industries. The main benefit is increased efficiency and accuracy in data processing.",
        ],
        [
            "Who is the CEO?",
            "Company ABC was founded in 2020. The current CEO is Jane Smith, who has led the company to significant growth.",
        ],
    ],
)
# Launch with MCP server enabled
if __name__ == "__main__":
    ui.launch(
        server_name="0.0.0.0",
        server_port=7860,
        mcp_server=True,
        show_error=True,
    )
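
# Illustrative client call (assumptions: the app is running locally on port 7860 and
# gr.Interface exposes its default "/predict" endpoint; adjust URL/endpoint as needed):
#
#   from gradio_client import Client
#   c = Client("http://localhost:7860/")
#   print(c.predict(
#       "Who is the CEO?",
#       "Company ABC was founded in 2020. The current CEO is Jane Smith, "
#       "who has led the company to significant growth.",
#       api_name="/predict",
#   ))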