import gradio as gr
import requests

# Define the URL for the local Ollama API and the model name.
OLLAMA_API_URL = "http://localhost:11434/api/generate"
# This must match the name used in `ollama pull` in the Dockerfile.
MODEL_NAME = "hf.co/unsloth/gemma-3-4b-it-qat-GGUF:Q4_K_M"


def generate_text(prompt, max_new_tokens=256, temperature=0.7):
    """Send a prompt to the Ollama API and return the generated text."""
    payload = {
        "model": MODEL_NAME,
        "prompt": prompt,
        "stream": False,  # We want the full response at once
        "options": {
            "num_predict": max_new_tokens,
            "temperature": temperature,
        },
    }
    try:
        # Send a POST request to the Ollama API.
        # Generous timeout (10 minutes) for potentially slow CPU inference.
        response = requests.post(OLLAMA_API_URL, json=payload, timeout=600)
        response.raise_for_status()  # Raise an exception for HTTP errors (4xx or 5xx)
        result = response.json()
        return result.get("response", "No response from model.")
    except requests.exceptions.RequestException as e:
        return f"Error communicating with Ollama: {e}"


# Create the Gradio interface.
iface = gr.Interface(
    fn=generate_text,
    inputs=[
        gr.Textbox(lines=5, label="Enter your prompt", placeholder="Type your message here..."),
        gr.Slider(minimum=1, maximum=1024, value=256, label="Max New Tokens",
                  info="Maximum number of tokens to generate."),
        gr.Slider(minimum=0.1, maximum=1.0, value=0.7, label="Temperature",
                  info="Controls randomness in generation; lower values are more deterministic."),
    ],
    outputs="text",
    title=f"Ollama {MODEL_NAME} on Hugging Face Spaces (CPU-only)",
    description=(
        "Interact with a Gemma 3 4B IT QAT GGUF model served by Ollama on CPU. "
        "Please be patient, as CPU inference can be slow."
    ),
)

# Launch the Gradio application.
# server_name="0.0.0.0" makes it accessible from outside the container.
# server_port=7860 is the default port for Gradio apps on Hugging Face Spaces.
if __name__ == "__main__":
    iface.launch(server_name="0.0.0.0", server_port=7860)
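
# --- Usage sketch (assumed example, not executed by this script) ---
# Once the app is running, the same endpoint can be called programmatically with
# gradio_client. The api_name "/predict" is the default exposed by a single
# gr.Interface; treat this as an illustrative sketch rather than a guaranteed
# part of this Space's API.
#
#   from gradio_client import Client
#
#   client = Client("http://localhost:7860")
#   reply = client.predict(
#       "Write a haiku about CPUs.",  # prompt
#       256,                          # max_new_tokens
#       0.7,                          # temperature
#       api_name="/predict",
#   )
#   print(reply)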