chenjianfei committed on
Commit
81d00e1
·
1 Parent(s): 73512c3
Files changed (2) hide show
  1. app.py +48 -16
  2. config.py +3 -26
app.py CHANGED
@@ -1,4 +1,3 @@
1
- import ollama
2
  import gradio as gr
3
  import numpy as np
4
  import json
@@ -6,7 +5,7 @@ from tts_api import TTSapi, DEFAULT_TTS_MODEL_NAME
6
  from config import *
7
  from utils import *
8
  from knowledge_base import LocalRAG, CosPlayer
9
-
10
 
11
  def handle_retry(history, thinking_history, config, section_state, retry_data: gr.RetryData):
12
  # 获取用户之前的消息
@@ -94,7 +93,7 @@ def predict(message, chat_history, thinking_history, config, section_state):
94
  input_message = section_state["chat_history"] + [{"role": "user", "content": message}]
95
 
96
  # 关闭Qwen3系列默认的思考模式
97
- if config['llm_model'].startswith('qwen3'):
98
  input_message[-1]['content'] += '/no_think'
99
  # input_message[-1]['content'] += '/no_think'
100
 
@@ -114,15 +113,38 @@ def predict(message, chat_history, thinking_history, config, section_state):
114
  gr.Warning("当前对话已经超出模型上下文长度,请开启新会话...")
115
  try:
116
  # 调用模型
117
- response = ollama.chat(
118
- model=config['llm_model'],
119
- messages=input_message,
120
- stream=False,
121
- options={'num_ctx': min(int(token_cnt * 1.2), MAX_MODEL_CTX)}
122
- )
 
 
123
 
124
- # 解析响应
125
- thinking, response_content = parse_output(response['message']['content'])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
126
 
127
  # 更新对话历史
128
  chat_history.append({'role': 'user', 'content': message})
@@ -190,11 +212,21 @@ def predict(message, chat_history, thinking_history, config, section_state):
190
  return "", chat_history, thinking_history, (synthesiser.sr if synthesiser else 16000, audio_output)
191
 
192
 
193
- def init_model(init_llm=False, init_rag=False, init_tts=False):
194
  if init_llm:
195
  print(f'正在加载LLM:{DEFAULT_MODEL_NAME}...')
196
- ollama.chat(model=DEFAULT_MODEL_NAME, messages=[])
197
-
 
 
 
 
 
 
 
 
 
 
198
  if init_rag:
199
  gr.Info("正在加载知识库,请稍候...")
200
  local_rag = LocalRAG(rag_top_k=RAG_TOP_K)
@@ -208,14 +240,14 @@ def init_model(init_llm=False, init_rag=False, init_tts=False):
208
  else:
209
  synthesiser = None
210
  TTS_LOADED = False
211
- return local_rag, synthesiser, TTS_LOADED
212
 
213
 
214
  if __name__ == "__main__":
215
  import time
216
  st = time.time()
217
  print('********************模型加载中************************')
218
- local_rag, synthesiser, TTS_LOADED = init_model()
219
  print('********************模型加载完成************************')
220
  print('耗时:',time.time() - st)
221
 
 
 
1
  import gradio as gr
2
  import numpy as np
3
  import json
 
5
  from config import *
6
  from utils import *
7
  from knowledge_base import LocalRAG, CosPlayer
8
+ from transformers import AutoModelForCausalLM, AutoTokenizer
9
 
10
  def handle_retry(history, thinking_history, config, section_state, retry_data: gr.RetryData):
11
  # 获取用户之前的消息
 
93
  input_message = section_state["chat_history"] + [{"role": "user", "content": message}]
94
 
95
  # 关闭Qwen3系列默认的思考模式
96
+ if config['llm_model'].startswith('Qwen3'):
97
  input_message[-1]['content'] += '/no_think'
98
  # input_message[-1]['content'] += '/no_think'
99
 
 
113
  gr.Warning("当前对话已经超出模型上下文长度,请开启新会话...")
114
  try:
115
  # 调用模型
116
+ if not LLM_LOADED:
117
+ core_llm = AutoModelForCausalLM.from_pretrained(
118
+ config['llm_model'],
119
+ torch_dtype="auto",
120
+ device_map="auto"
121
+ )
122
+ core_tokenizer = AutoTokenizer.from_pretrained(config['llm_model'])
123
+ LLM_LOADED = True
124
 
125
+ text = core_tokenizer.apply_chat_template(
126
+ input_message,
127
+ tokenize=False,
128
+ add_generation_prompt=True
129
+ )
130
+ model_inputs = core_tokenizer([text], return_tensors="pt").to(core_llm.device)
131
+ # conduct text completion
132
+ generated_ids = core_llm.generate(
133
+ **model_inputs,
134
+ max_new_tokens=32768
135
+ )
136
+ output_ids = generated_ids[0][len(model_inputs.input_ids[0]):].tolist()
137
+
138
+ # parsing thinking content
139
+ # try:
140
+ # # rindex finding 151668 (</think>)
141
+ # index = len(output_ids) - output_ids[::-1].index(151668)
142
+ # except ValueError:
143
+ # index = 0
144
+ index = 0
145
+ # thinking_content = tokenizer.decode(output_ids[:index], skip_special_tokens=True).strip("\n")
146
+ thinking = None
147
+ response_content = tokenizer.decode(output_ids[index:], skip_special_tokens=True).strip("\n")
148
 
149
  # 更新对话历史
150
  chat_history.append({'role': 'user', 'content': message})
 
212
  return "", chat_history, thinking_history, (synthesiser.sr if synthesiser else 16000, audio_output)
213
 
214
 
215
+ def init_model(init_llm=True, init_rag=False, init_tts=False):
216
  if init_llm:
217
  print(f'正在加载LLM:{DEFAULT_MODEL_NAME}...')
218
+ core_llm = AutoModelForCausalLM.from_pretrained(
219
+ DEFAULT_MODEL_NAME,
220
+ torch_dtype="auto",
221
+ device_map="auto"
222
+ )
223
+ print('device:', core_llm.device)
224
+ core_tokenizer = AutoTokenizer.from_pretrained(DEFAULT_MODEL_NAME)
225
+ LLM_LOADED = True
226
+ else:
227
+ core_llm, core_tokenizer = None, None
228
+ LLM_LOADED = False
229
+
230
  if init_rag:
231
  gr.Info("正在加载知识库,请稍候...")
232
  local_rag = LocalRAG(rag_top_k=RAG_TOP_K)
 
240
  else:
241
  synthesiser = None
242
  TTS_LOADED = False
243
+ return local_rag, synthesiser, core_llm, core_tokenizer, TTS_LOADED, LLM_LOADED
244
 
245
 
246
  if __name__ == "__main__":
247
  import time
248
  st = time.time()
249
  print('********************模型加载中************************')
250
+ local_rag, synthesiser, core_llm, core_tokenizer, TTS_LOADED, LLM_LOADED = init_model()
251
  print('********************模型加载完成************************')
252
  print('耗时:',time.time() - st)
253
 
config.py CHANGED
@@ -1,36 +1,13 @@
1
  from pathlib import Path
2
  import os
3
 
4
- DEFAULT_MODEL_NAME = "qwen2.5:32b-instruct"
5
  DEFAULT_MODE = "角色扮演"
6
  DEFAULT_C_SETTING_MODE = "by system"
7
  DEFAULT_COSPLAY_SETTING = 'rag/characters/周杰伦.txt'
8
  AVALIABLE_MODELS = [
9
- "deepseek-r1:7b",
10
- "deepseek-r1:14b",
11
- "deepseek-r1:32b",
12
- "qwq",
13
- "qwen2.5:0.5b-instruct",
14
- "qwen2.5:0.5b",
15
- # "qwen:1.8b",
16
- # "qwen2.5:7b",
17
- # "qwen2.5:14b",
18
- "qwen2.5:32b",
19
- "qwen2.5:32b-instruct",
20
- "qwen7B_jaychou_f16",
21
- "qwen0.5B_jaychou13",
22
- "qwen2.5:14b-instruct",
23
- "qwen2.5:7b-instruct",
24
- "qwen2.5:3b-instruct",
25
- "qwen14B_jaychou_q8_newdata_add_template",
26
- "qwen2.5_32B_jaychou",
27
- "qwen2.5_0.5B_jaychou_lora",
28
- # "qwen2.5_32B_jaychou_tq1"
29
- "qwen3:4b",
30
- "qwen3:8b",
31
- "qwen3:14b",
32
- "qwen3:32b",
33
- "qwen3:30b-a3b"
34
  ]
35
  BASE_MODEL_TABLE = {"qwen7B_jaychou_f16": "qwen2.5:7b-instruct", "qwen0.5B_jaychou13": "qwen2.5:0.5b-instruct",
36
  "qwen14B_jaychou_q8_newdata_add_template": "qwen2.5:14b-instruct",
 
1
  from pathlib import Path
2
  import os
3
 
4
+ DEFAULT_MODEL_NAME = "Qwen/Qwen3-30B-A3B"
5
  DEFAULT_MODE = "角色扮演"
6
  DEFAULT_C_SETTING_MODE = "by system"
7
  DEFAULT_COSPLAY_SETTING = 'rag/characters/周杰伦.txt'
8
  AVALIABLE_MODELS = [
9
+ "Qwen/Qwen3-30B-A3B",
10
+ "Qwen/Qwen2.5-32B-Instruct"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
11
  ]
12
  BASE_MODEL_TABLE = {"qwen7B_jaychou_f16": "qwen2.5:7b-instruct", "qwen0.5B_jaychou13": "qwen2.5:0.5b-instruct",
13
  "qwen14B_jaychou_q8_newdata_add_template": "qwen2.5:14b-instruct",