Spaces:

hsuwill000
/

ESP01LLMSample

Sleeping

App Files Files Community

hsuwill000 committed 18 days ago

Commit

defa84e

verified ·

1 Parent(s): 0584daf

Update app.py

Browse files

Files changed (1) hide show

app.py +156 -60

app.py CHANGED Viewed

@@ -1,70 +1,166 @@
 import gradio as gr
-from huggingface_hub import InferenceClient
-def respond(
-    message,
-    history: list[dict[str, str]],
-    system_message,
-    max_tokens,
-    temperature,
-    top_p,
-    hf_token: gr.OAuthToken,
-):
-    """
-    For more information on `huggingface_hub` Inference API support, please check the docs: https://huggingface.co/docs/huggingface_hub/v0.22.2/en/guides/inference
     """
-    client = InferenceClient(token=hf_token.token, model="openai/gpt-oss-20b")
     messages = [{"role": "system", "content": system_message}]
-    messages.extend(history)
-    messages.append({"role": "user", "content": message})
-    response = ""
-    for message in client.chat_completion(
-        messages,
-        max_tokens=max_tokens,
-        stream=True,
-        temperature=temperature,
-        top_p=top_p,
-    ):
-        choices = message.choices
-        token = ""
-        if len(choices) and choices[0].delta.content:
-            token = choices[0].delta.content
-        response += token
-        yield response
-"""
-For information on how to customize the ChatInterface, peruse the gradio docs: https://www.gradio.app/docs/chatinterface
-"""
-chatbot = gr.ChatInterface(
-    respond,
-    type="messages",
-    additional_inputs=[
-        gr.Textbox(value="You are a friendly Chatbot.", label="System message"),
-        gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
-        gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
-        gr.Slider(
-            minimum=0.1,
-            maximum=1.0,
-            value=0.95,
-            step=0.05,
-            label="Top-p (nucleus sampling)",
-        ),
-    ],
-)
-with gr.Blocks() as demo:
-    with gr.Sidebar():
-        gr.LoginButton()
-    chatbot.render()
 if __name__ == "__main__":
-    demo.launch()

+# app.py
+import os
 import gradio as gr
+from typing import List, Dict
+from huggingface_hub import hf_hub_download
+from llama_cpp import Llama
+# --- 1. Model configuration and download ---
+# GGUF file to run and the Hub repository it is fetched from
+MODEL_NAME = "Qwen3-0.6B-Q8_0.gguf"
+MODEL_REPO = "Qwen/Qwen3-0.6B-GGUF"
+# Fixed system prompt
+DEFAULT_SYSTEM_MESSAGE = "You are a friendly and helpful assistant. Please answer the user's questions concisely and accurately."
+# Step 1: download the GGUF model
+# The file is stored in ~/.cache/huggingface/hub/ or the configured cache directory;
+# hf_hub_download's return value is kept as the local model path.
+try:
+    print(f"嘗試從 {MODEL_REPO} 下載 {MODEL_NAME}...")
+    model_path = hf_hub_download(repo_id=MODEL_REPO, filename=MODEL_NAME)
+    print(f"模型下載完成，路徑: {model_path}")
+except Exception as e:
+    print(f"**錯誤**：無法下載模型。請檢查網路連線或模型名稱/權限。錯誤訊息: {e}")
+    # The application cannot start without the model, so abort start-up with
+    # exit status 1. SystemExit is raised directly because the `exit` helper
+    # is injected by the `site` module for interactive sessions and is not
+    # meant to be relied on in programs.
+    raise SystemExit(1) from e
+# --- 2. Llama.cpp initialisation ---
+# Step 2: create the Llama.cpp instance
+# n_gpu_layers=0 means no GPU is used (CPU inference); it can be set to >0
+# when the environment supports CUDA/cuBLAS.
+try:
+    print("正在初始化 Llama.cpp 實例...")
+    llm = Llama(
+        model_path=model_path,
+        n_ctx=4096,  # context length
+        n_batch=512,  # batch size
+        # Half of the CPU cores, never fewer than one. os.cpu_count() may
+        # return None when the count cannot be determined, so fall back to 1
+        # instead of failing on `None // 2`.
+        n_threads=max(1, (os.cpu_count() or 1) // 2),
+        n_gpu_layers=0,  # CPU inference
+        verbose=False,  # silence llama.cpp's internal log output
+    )
+    print("Llama.cpp 模型加載成功。")
+except Exception as e:
+    print(f"**錯誤**：Llama.cpp 實例初始化失敗。錯誤訊息: {e}")
+    # Abort start-up with exit status 1; SystemExit is raised directly rather
+    # than through the interactive-only `exit` helper from the `site` module.
+    raise SystemExit(1) from e
+# --- 3. Core inference function ---
+def llama_inference(
+    message: str,
+    chat_history: List[List[str]],
+    system_message: str = DEFAULT_SYSTEM_MESSAGE,
+    max_tokens: int = 4096,
+    temperature: float = 0.7,
+    top_p: float = 0.95
+) -> str:
     """
+    Run one non-streaming chat completion on the module-level ``llm``.
+
+    :param message: The current user input.
+    :param chat_history: Earlier turns as ``[user, assistant]`` pairs.
+        ``None`` or an empty list is treated as "no history yet".
+    :param system_message: Content of the leading system message.
+    :param max_tokens: Forwarded as the completion's ``max_tokens``.
+    :param temperature: Forwarded as the completion's ``temperature``.
+    :param top_p: Forwarded as the completion's ``top_p``.
+    :return: The reply text; a fixed warning string when the reply is missing
+        or empty; or an error string when the completion call raises.
+    """
+    # Convert the pair-style history into an OpenAI-style messages list.
     messages = [{"role": "system", "content": system_message}]
+    for human, assistant in chat_history or []:
+        # A turn can be half-filled (None on one side); never forward a None
+        # content to the model.
+        if human is not None:
+            messages.append({"role": "user", "content": human})
+        if assistant is not None:
+            messages.append({"role": "assistant", "content": assistant})
+    # The current message always goes last.
+    messages.append({"role": "user", "content": message})
+    try:
+        # create_chat_completion follows the OpenAI response layout.
+        response = llm.create_chat_completion(
+            messages=messages,
+            max_tokens=max_tokens,
+            temperature=temperature,
+            top_p=top_p,
+            # stream=False is the default
+        )
+        # Walk choices[0].message.content defensively: any missing level
+        # yields None instead of raising.
+        choices = response.get('choices') or []
+        first_message = choices[0].get('message') if choices else None
+        reply = (first_message or {}).get('content')
+        # `or` (not a .get default) so that a present-but-None or empty
+        # content also produces the warning and the result is always a str.
+        return reply or "⚠️ LLM 服務回傳空內容。"
+    except Exception as e:
+        # Report the failure in the chat instead of propagating it to Gradio.
+        print(f"[Error] Llama Inference failed: {e}")
+        return f"❌ 伺服器錯誤 (Llama.cpp 推論失敗): {e}"
+# --- 4. Gradio interface setup ---
+# Chat function called from the Gradio UI
+def chat_interface(message: str, history: List[List[str]]):
+    """Return the model's reply text for ``message`` given ``history``.
+
+    Delegates to ``llama_inference`` with fixed generation settings and
+    returns its result unchanged.
+    """
+    # The generation parameters are hard-coded for simplicity; they could
+    # instead be read from additional input components.
+    fixed_settings = {
+        "system_message": DEFAULT_SYSTEM_MESSAGE,
+        "max_tokens": 4096,
+        "temperature": 0.7,
+        "top_p": 0.95,
+    }
+    return llama_inference(message=message, chat_history=history, **fixed_settings)
+# Build the Gradio interface
+with gr.Blocks(title="Qwen3-0.6B-GGUF 聊天機器人") as demo:
+    gr.Markdown(
+        f"""
+        # Qwen3-0.6B-GGUF 聊天機器人
+        使用 **llama-cpp-python** 模組運行 **{MODEL_NAME}** 模型。
+        """
+    )
+    # Conversation display
+    chatbot = gr.Chatbot(
+        label="聊天記錄",
+        height=500
+    )
+    # User input box
+    chat_input = gr.Textbox(
+        show_label=False,
+        placeholder="請輸入你的問題...",
+        container=False
+    )
+    def _append_reply(message, history):
+        """Return ``history`` extended with the new [user, assistant] turn.
+
+        The Chatbot output expects the whole conversation, not just the
+        reply string that ``chat_interface`` returns.
+        """
+        # NOTE(review): assumes the Chatbot uses the pair-style history that
+        # llama_inference unpacks; confirm against the installed Gradio version.
+        history = history or []
+        return history + [[message, chat_interface(message, history)]]
+    # Wire up the chat logic.
+    # submit event:
+    # - fn: Python function to run (_append_reply)
+    # - inputs: values passed to it ([Textbox content, Chatbot history])
+    # - outputs: where its result goes (the Chatbot's new history)
+    chat_input.submit(
+        fn=_append_reply,
+        inputs=[chat_input, chatbot],
+        outputs=chatbot
+    ).then(
+        # Clear the input box once the reply has been added
+        fn=lambda: "",
+        inputs=None,
+        outputs=chat_input,
+        queue=False
+    )
+# Launch the application
 if __name__ == "__main__":
+    # Runs only when the file is executed directly, e.g. for local testing:
+    # python app.py
+    # Serves on all network interfaces (0.0.0.0) on port 7860.
+    demo.launch(server_name="0.0.0.0", server_port=7860)