import os
import asyncio
import logging

import gradio as gr
from huggingface_hub import InferenceClient

from .prompt import build_prompt

# ---------------------------------------------------------------------
# model / client initialisation
# ---------------------------------------------------------------------
HF_TOKEN = os.getenv("HF_TOKEN")
MODEL_ID = os.getenv("MODEL_ID", "meta-llama/Meta-Llama-3-8B-Instruct")
MAX_TOKENS = int(os.getenv("MAX_NEW_TOKENS", "512"))
TEMPERATURE = float(os.getenv("TEMPERATURE", "0.2"))

if not HF_TOKEN:
    raise RuntimeError(
        "HF_TOKEN env-var missing. Set it before launching the app."
    )

client = InferenceClient(model=MODEL_ID, token=HF_TOKEN)

# ---------------------------------------------------------------------
# Core generation function for both Gradio UI and MCP
# ---------------------------------------------------------------------
async def _call_llm(prompt: str) -> str:
    """
    Try text_generation first (for models/providers that still support it);
    fall back to chat_completion when the provider is chat-only (Novita, etc.).
    """
    try:
        # hf-inference
        return await asyncio.to_thread(
            client.text_generation,
            prompt,
            max_new_tokens=MAX_TOKENS,
            temperature=TEMPERATURE,
        )
    except ValueError as e:
        if "Supported task: conversational" not in str(e):
            raise  # genuine error → bubble up
        # fallback for Novita
        messages = [{"role": "user", "content": prompt}]
        completion = await asyncio.to_thread(
            client.chat_completion,
            messages=messages,
            model=MODEL_ID,
            max_tokens=MAX_TOKENS,
            temperature=TEMPERATURE,
        )
        return completion.choices[0].message.content.strip()


async def rag_generate(query: str, context: str) -> str:
    """
    Generate an answer to a query using provided context through RAG.

    This function takes a user query and relevant context, then uses a
    language model to generate a comprehensive answer based on the provided
    information.

    Args:
        query (str): The user's question or query
        context (str): The relevant context/documents to use for answering

    Returns:
        str: The generated answer based on the query and context
    """
    if not query.strip():
        return "Error: Query cannot be empty"
    if not context.strip():
        return "Error: Context cannot be empty"

    prompt = build_prompt(query, context)
    try:
        answer = await _call_llm(prompt)
        return answer
    except Exception as e:
        logging.exception("Generation failed")
        return f"Error: {str(e)}"


# ---------------------------------------------------------------------
# Gradio Interface with MCP support
# ---------------------------------------------------------------------
ui = gr.Interface(
    fn=rag_generate,
    inputs=[
        gr.Textbox(
            label="Query",
            lines=2,
            placeholder="What would you like to know?",
            info="Enter your question here",
        ),
        gr.Textbox(
            label="Context",
            lines=8,
            placeholder="Paste relevant documents or context here...",
            info="Provide the context/documents to use for answering",
        ),
    ],
    outputs=gr.Textbox(
        label="Generated Answer",
        lines=6,
        show_copy_button=True,
    ),
    title="RAG Generation Service",
    description=(
        "Ask questions and get answers based on your provided context. "
        "This service is also available as an MCP server for integration "
        "with AI applications."
    ),
    examples=[
        [
            "What is the main benefit mentioned?",
            "Machine learning has revolutionized many industries. "
            "The main benefit is increased efficiency and accuracy in data processing.",
        ],
        [
            "Who is the CEO?",
            "Company ABC was founded in 2020. The current CEO is Jane Smith, "
            "who has led the company to significant growth.",
        ],
    ],
)
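# ---------------------------------------------------------------------
# Local smoke test (sketch, not invoked by this module): exercises
# rag_generate directly, bypassing the UI and MCP layers. The sample
# query/context strings are illustrative only; running it requires a
# valid HF_TOKEN in the environment.
# ---------------------------------------------------------------------
def _smoke_test() -> None:
    sample_query = "Who is the CEO?"
    sample_context = (
        "Company ABC was founded in 2020. The current CEO is Jane Smith."
    )
    # rag_generate is async, so drive it with asyncio.run
    print(asyncio.run(rag_generate(sample_query, sample_context)))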
# Launch with MCP server enabled
if __name__ == "__main__":
    ui.launch(
        server_name="0.0.0.0",
        server_port=7860,
        mcp_server=True,
        show_error=True,
    )
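# ---------------------------------------------------------------------
# Example client call (sketch): once the app is running, the same function
# can be reached from another process over Gradio's HTTP API. The endpoint
# name below assumes gr.Interface's default ("/predict"); confirm it on the
# app's "Use via API" page. MCP clients would instead connect to the MCP
# endpoint Gradio exposes when mcp_server=True.
#
#   from gradio_client import Client
#
#   remote = Client("http://localhost:7860/")
#   answer = remote.predict(
#       "Who is the CEO?",  # query
#       "Company ABC was founded in 2020. The current CEO is Jane Smith.",  # context
#       api_name="/predict",
#   )
#   print(answer)
# ---------------------------------------------------------------------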