SkyNetWalker committed on
Commit c2ba929 · verified · 1 Parent(s): 36269ba

Update app.py

Files changed (1)
  1. app.py +17 -44
app.py CHANGED
@@ -1,50 +1,23 @@
  import gradio as gr
- import requests
- import json
+ import ollama

- # Define the URL for the local Ollama API and the model name
- OLLAMA_API_URL = "http://localhost:11434/api/generate"
- # This must match the name used in `ollama pull` in Dockerfile
- MODEL_NAME = "gemma3_4b_it_qat"
+ # The model name must exactly match what was pulled from Hugging Face
+ MODEL_NAME = 'hf.co/unsloth/gemma-3-4b-it-qat-GGUF:Q4_K_M'

- def generate_text(prompt, max_new_tokens=256, temperature=0.7):
-     """
-     Function to send a prompt to the Ollama API and get a response.
-     """
-     payload = {
-         "model": MODEL_NAME,
-         "prompt": prompt,
-         "stream": False,  # We want the full response at once
-         "options": {
-             "num_predict": max_new_tokens,
-             "temperature": temperature,
-         }
-     }
-     try:
-         # Send a POST request to the Ollama API.
-         # Increased timeout for potentially slow CPU inference.
-         response = requests.post(OLLAMA_API_URL, json=payload, timeout=600)  # 10 minutes timeout
-         response.raise_for_status()  # Raise an exception for HTTP errors (4xx or 5xx)
-         result = response.json()
-         return result.get("response", "No response from model.")
-     except requests.exceptions.RequestException as e:
-         return f"Error communicating with Ollama: {e}"
+ def predict(prompt, history):
+     # The history is not used in this simple example, but is required by the ChatInterface
+     response = ollama.chat(
+         model=MODEL_NAME,
+         messages=[{'role': 'user', 'content': prompt}]
+     )
+     return response['message']['content']

- # Create the Gradio interface
- iface = gr.Interface(
-     fn=generate_text,
-     inputs=[
-         gr.Textbox(lines=5, label="Enter your prompt", placeholder="Type your message here..."),
-         gr.Slider(minimum=1, maximum=1024, value=256, label="Max New Tokens", info="Maximum number of tokens to generate."),
-         gr.Slider(minimum=0.1, maximum=1.0, value=0.7, label="Temperature", info="Controls randomness in generation. Lower values are less random.")
-     ],
-     outputs="text",
-     title=f"Ollama {MODEL_NAME} on Hugging Face Spaces (CPU-only)",
-     description="Interact with a Gemma 3.4B IT QAT GGUF model served by Ollama on CPU. Please be patient, as CPU inference can be slow."
+ # Setup the Gradio Chat Interface
+ iface = gr.ChatInterface(
+     fn=predict,
+     title="Gemma-3 QAT GGUF Chat",
+     description=f"Chat with the {MODEL_NAME} model via Ollama."
  )

- # Launch the Gradio application
- # server_name="0.0.0.0" makes it accessible from outside the container.
- # server_port=7860 is the default port for Gradio apps on Hugging Face Spaces.
- if __name__ == "__main__":
-     iface.launch(server_name="0.0.0.0", server_port=7860)
+ # Launch the interface
+ iface.launch(server_name="0.0.0.0", server_port=7860)
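Note that the new predict drops the history argument that gr.ChatInterface passes in (as its own comment says). A rough sketch of how the earlier turns could be folded into the ollama.chat call, not part of this commit and assuming Gradio's default history format of (user, assistant) pairs:

import ollama

MODEL_NAME = 'hf.co/unsloth/gemma-3-4b-it-qat-GGUF:Q4_K_M'

def predict(prompt, history):
    # Hypothetical extension: replay prior turns so the model sees the whole conversation.
    # `history` is assumed to be a list of (user_message, assistant_message) pairs.
    messages = []
    for user_msg, assistant_msg in history:
        messages.append({'role': 'user', 'content': user_msg})
        if assistant_msg:
            messages.append({'role': 'assistant', 'content': assistant_msg})
    messages.append({'role': 'user', 'content': prompt})
    response = ollama.chat(model=MODEL_NAME, messages=messages)
    return response['message']['content']

Either way, the GGUF model still has to be available to the local Ollama server before the app starts, presumably via something like `ollama pull hf.co/unsloth/gemma-3-4b-it-qat-GGUF:Q4_K_M` in the Dockerfile, as the removed comment about `ollama pull` suggests.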