hsuwill000 committed · verified
Commit 423bdb4 · 1 Parent(s): c4c110e

Upload 3 files

Files changed (3):
  1. app.py +81 -0
  2. llama_cpp_avx512.zip +3 -0
  3. start.sh +12 -0
app.py ADDED
@@ -0,0 +1,81 @@
+ import socket
+ import gradio as gr
+ from openai import OpenAI
+ 
+ def get_local_ip():
+     # Open a UDP socket toward an external host (no packet is actually sent)
+     s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
+     try:
+         # Google's public DNS IP is used only to pick a routable interface
+         s.connect(("8.8.8.8", 80))
+         ip = s.getsockname()[0]
+     except Exception:
+         ip = "127.0.0.1"
+     finally:
+         s.close()
+     return ip
+ 
+ print("Local IP:", get_local_ip())
+ 
+ # ✅ Point the OpenAI client at the local llama.cpp API
+ client = OpenAI(
+     base_url="http://127.0.0.1:8000/v1",  # connect to localhost; 0.0.0.0 is a bind address, not a destination
+     api_key="sk-local",  # llama.cpp does not check the key's value; it only has to be present
+     timeout=600
+ )
+ 
+ # ✅ Response function (streaming mode)
+ def respond(
+     message,
+     history: list[tuple[str, str]],
+     system_message,
+     max_tokens,
+     temperature,
+     top_p,
+ ):
+     messages = [{"role": "system", "content": system_message}]
+ 
+     for user, assistant in history:
+         if user:
+             messages.append({"role": "user", "content": user})
+         if assistant:
+             messages.append({"role": "assistant", "content": assistant})
+ 
+     messages.append({"role": "user", "content": message})
+ 
+     try:
+         # 🔹 Change 1: enable streaming
+         stream = client.chat.completions.create(
+             model="qwen3",  # ⚠️ replace with the model name your llama.cpp server loaded
+             messages=messages,
+             max_tokens=max_tokens,
+             temperature=temperature,
+             top_p=top_p,
+             stream=True,
+         )
+ 
+         output = ""
+         # 🔹 Change 2: consume the streamed response chunk by chunk
+         for chunk in stream:
+             delta = chunk.choices[0].delta.content or ""
+             output += delta
+             yield output  # ✅ push the partial output to Gradio immediately
+ 
+     except Exception as e:
+         print(f"[Error] {e}")
+         yield "⚠️ The llama.cpp server is not responding. Please try again later."
+ 
+ # ✅ Gradio interface (Change 3: respond is a generator, which ChatInterface streams natively)
+ demo = gr.ChatInterface(
+     respond,
+     additional_inputs=[
+         gr.Textbox(value="You are a friendly assistant.", label="System message"),
+         gr.Slider(minimum=1, maximum=4096, value=1024, step=1, label="Max new tokens"),
+         gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
+         gr.Slider(minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="Top-p (nucleus sampling)"),
+     ],
+ )
+ 
+ if __name__ == "__main__":
+     demo.launch()
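
A quick way to sanity-check the endpoint before wiring up the UI is a one-shot, non-streaming request against the same base URL. This is a minimal sketch, not part of the commit; it assumes llama-server is already up on port 8000, and that (as is usual for llama.cpp's OpenAI-compatible server) the model field is not validated against the loaded model:

    from openai import OpenAI

    # Same connection settings as app.py above
    client = OpenAI(base_url="http://127.0.0.1:8000/v1", api_key="sk-local")

    resp = client.chat.completions.create(
        model="qwen3",  # llama-server typically serves whatever model it loaded, regardless of this name
        messages=[{"role": "user", "content": "Say hello in one short sentence."}],
        max_tokens=32,
    )
    print(resp.choices[0].message.content)

If this prints a reply, the Gradio app should stream against the same endpoint.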
llama_cpp_avx512.zip ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:6d460399911f9467107a39ad69b3f0e33d4135b8366aa5f5d6aecf8762f4593c
+ size 30338319
start.sh ADDED
@@ -0,0 +1,12 @@
+ #!/bin/bash
+ 
+ 
+ export WORK="/home/user/app"
+ cd "$WORK"
+ unzip llama_cpp_avx512.zip > /dev/null 2>&1
+ wget -O model.gguf https://huggingface.co/unsloth/gpt-oss-20b-GGUF/resolve/main/gpt-oss-20b-Q4_K_M.gguf > /dev/null 2>&1
+ 
+ ./llama-server -m model.gguf --port 8000 --host 0.0.0.0 --threads 2 --ctx-size 4096 --mlock --jinja \
+     --temp 0.2 \
+     --top-p 0.85 &
+ python3 app.py
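
Note that start.sh backgrounds llama-server and launches app.py immediately, so the first chat requests can arrive while the 20B model is still loading. A small readiness gate avoids that race. The following is a sketch, not part of the commit; it assumes llama-server's standard /health endpoint, which returns 503 while the model is loading and 200 once it is ready:

    import time
    import urllib.request

    def wait_for_server(url="http://127.0.0.1:8000/health", timeout=300):
        """Poll llama-server's /health endpoint until it reports ready."""
        deadline = time.time() + timeout
        while time.time() < deadline:
            try:
                with urllib.request.urlopen(url, timeout=5) as resp:
                    if resp.status == 200:  # 200 = model loaded and serving
                        return True
            except OSError:
                pass  # connection refused (server not up yet) or HTTP 503 (still loading)
            time.sleep(2)
        return False

    if __name__ == "__main__":
        if not wait_for_server():
            raise SystemExit("llama-server did not become ready in time")

Saved as a helper (a hypothetical wait_for_server.py, for example) and run between the llama-server line and python3 app.py, this would make the launch deterministic instead of relying on the model loading faster than the first request arrives.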