chenjianfei committed on
Commit
81d00e1
·
1 Parent(s): 73512c3
Files changed (2) hide show
  1. app.py +48 -16
  2. config.py +3 -26
app.py CHANGED
@@ -1,4 +1,3 @@
1
- import ollama
2
  import gradio as gr
3
  import numpy as np
4
  import json
@@ -6,7 +5,7 @@ from tts_api import TTSapi, DEFAULT_TTS_MODEL_NAME
6
  from config import *
7
  from utils import *
8
  from knowledge_base import LocalRAG, CosPlayer
9
-
10
 
11
  def handle_retry(history, thinking_history, config, section_state, retry_data: gr.RetryData):
12
  # 获取用户之前的消息
@@ -94,7 +93,7 @@ def predict(message, chat_history, thinking_history, config, section_state):
94
  input_message = section_state["chat_history"] + [{"role": "user", "content": message}]
95
 
96
  # 关闭Qwen3系列默认的思考模式
97
- if config['llm_model'].startswith('qwen3'):
98
  input_message[-1]['content'] += '/no_think'
99
  # input_message[-1]['content'] += '/no_think'
100
 
@@ -114,15 +113,38 @@ def predict(message, chat_history, thinking_history, config, section_state):
114
  gr.Warning("当前对话已经超出模型上下文长度,请开启新会话...")
115
  try:
116
  # 调用模型
117
- response = ollama.chat(
118
- model=config['llm_model'],
119
- messages=input_message,
120
- stream=False,
121
- options={'num_ctx': min(int(token_cnt * 1.2), MAX_MODEL_CTX)}
122
- )
 
 
123
 
124
- # 解析响应
125
- thinking, response_content = parse_output(response['message']['content'])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
126
 
127
  # 更新对话历史
128
  chat_history.append({'role': 'user', 'content': message})
@@ -190,11 +212,21 @@ def predict(message, chat_history, thinking_history, config, section_state):
190
  return "", chat_history, thinking_history, (synthesiser.sr if synthesiser else 16000, audio_output)
191
 
192
 
193
- def init_model(init_llm=False, init_rag=False, init_tts=False):
194
  if init_llm:
195
  print(f'正在加载LLM:{DEFAULT_MODEL_NAME}...')
196
- ollama.chat(model=DEFAULT_MODEL_NAME, messages=[])
197
-
 
 
 
 
 
 
 
 
 
 
198
  if init_rag:
199
  gr.Info("正在加载知识库,请稍候...")
200
  local_rag = LocalRAG(rag_top_k=RAG_TOP_K)
@@ -208,14 +240,14 @@ def init_model(init_llm=False, init_rag=False, init_tts=False):
208
  else:
209
  synthesiser = None
210
  TTS_LOADED = False
211
- return local_rag, synthesiser, TTS_LOADED
212
 
213
 
214
  if __name__ == "__main__":
215
  import time
216
  st = time.time()
217
  print('********************模型加载中************************')
218
- local_rag, synthesiser, TTS_LOADED = init_model()
219
  print('********************模型加载完成************************')
220
  print('耗时:',time.time() - st)
221
 
 
 
1
  import gradio as gr
2
  import numpy as np
3
  import json
 
5
  from config import *
6
  from utils import *
7
  from knowledge_base import LocalRAG, CosPlayer
8
+ from transformers import AutoModelForCausalLM, AutoTokenizer
9
 
10
  def handle_retry(history, thinking_history, config, section_state, retry_data: gr.RetryData):
11
  # 获取用户之前的消息
 
93
  input_message = section_state["chat_history"] + [{"role": "user", "content": message}]
94
 
95
  # 关闭Qwen3系列默认的思考模式
96
+ if config['llm_model'].startswith('Qwen3'):
97
  input_message[-1]['content'] += '/no_think'
98
  # input_message[-1]['content'] += '/no_think'
99
 
 
113
  gr.Warning("当前对话已经超出模型上下文长度,请开启新会话...")
114
  try:
115
  # 调用模型
116
+ if not LLM_LOADED:
117
+ core_llm = AutoModelForCausalLM.from_pretrained(
118
+ config['llm_model'],
119
+ torch_dtype="auto",
120
+ device_map="auto"
121
+ )
122
+ core_tokenizer = AutoTokenizer.from_pretrained(config['llm_model'])
123
+ LLM_LOADED = True
124
 
125
+ text = core_tokenizer.apply_chat_template(
126
+ input_message,
127
+ tokenize=False,
128
+ add_generation_prompt=True
129
+ )
130
+ model_inputs = core_tokenizer([text], return_tensors="pt").to(core_llm.device)
131
+ # conduct text completion
132
+ generated_ids = core_llm.generate(
133
+ **model_inputs,
134
+ max_new_tokens=32768
135
+ )
136
+ output_ids = generated_ids[0][len(model_inputs.input_ids[0]):].tolist()
137
+
138
+ # parsing thinking content
139
+ # try:
140
+ # # rindex finding 151668 (</think>)
141
+ # index = len(output_ids) - output_ids[::-1].index(151668)
142
+ # except ValueError:
143
+ # index = 0
144
+ index = 0
145
+ # thinking_content = tokenizer.decode(output_ids[:index], skip_special_tokens=True).strip("\n")
146
+ thinking = None
147
+ response_content = tokenizer.decode(output_ids[index:], skip_special_tokens=True).strip("\n")
148
 
149
  # 更新对话历史
150
  chat_history.append({'role': 'user', 'content': message})
 
212
  return "", chat_history, thinking_history, (synthesiser.sr if synthesiser else 16000, audio_output)
213
 
214
 
215
+ def init_model(init_llm=True, init_rag=False, init_tts=False):
216
  if init_llm:
217
  print(f'正在加载LLM:{DEFAULT_MODEL_NAME}...')
218
+ core_llm = AutoModelForCausalLM.from_pretrained(
219
+ DEFAULT_MODEL_NAME,
220
+ torch_dtype="auto",
221
+ device_map="auto"
222
+ )
223
+ print('device:', core_llm.device)
224
+ core_tokenizer = AutoTokenizer.from_pretrained(DEFAULT_MODEL_NAME)
225
+ LLM_LOADED = True
226
+ else:
227
+ core_llm, core_tokenizer = None, None
228
+ LLM_LOADED = False
229
+
230
  if init_rag:
231
  gr.Info("正在加载知识库,请稍候...")
232
  local_rag = LocalRAG(rag_top_k=RAG_TOP_K)
 
240
  else:
241
  synthesiser = None
242
  TTS_LOADED = False
243
+ return local_rag, synthesiser, core_llm, core_tokenizer, TTS_LOADED, LLM_LOADED
244
 
245
 
246
  if __name__ == "__main__":
247
  import time
248
  st = time.time()
249
  print('********************模型加载中************************')
250
+ local_rag, synthesiser, core_llm, core_tokenizer, TTS_LOADED, LLM_LOADED = init_model()
251
  print('********************模型加载完成************************')
252
  print('耗时:',time.time() - st)
253
 
config.py CHANGED
@@ -1,36 +1,13 @@
1
  from pathlib import Path
2
  import os
3
 
4
- DEFAULT_MODEL_NAME = "qwen2.5:32b-instruct"
5
  DEFAULT_MODE = "角色扮演"
6
  DEFAULT_C_SETTING_MODE = "by system"
7
  DEFAULT_COSPLAY_SETTING = 'rag/characters/周杰伦.txt'
8
  AVALIABLE_MODELS = [
9
- "deepseek-r1:7b",
10
- "deepseek-r1:14b",
11
- "deepseek-r1:32b",
12
- "qwq",
13
- "qwen2.5:0.5b-instruct",
14
- "qwen2.5:0.5b",
15
- # "qwen:1.8b",
16
- # "qwen2.5:7b",
17
- # "qwen2.5:14b",
18
- "qwen2.5:32b",
19
- "qwen2.5:32b-instruct",
20
- "qwen7B_jaychou_f16",
21
- "qwen0.5B_jaychou13",
22
- "qwen2.5:14b-instruct",
23
- "qwen2.5:7b-instruct",
24
- "qwen2.5:3b-instruct",
25
- "qwen14B_jaychou_q8_newdata_add_template",
26
- "qwen2.5_32B_jaychou",
27
- "qwen2.5_0.5B_jaychou_lora",
28
- # "qwen2.5_32B_jaychou_tq1"
29
- "qwen3:4b",
30
- "qwen3:8b",
31
- "qwen3:14b",
32
- "qwen3:32b",
33
- "qwen3:30b-a3b"
34
  ]
35
  BASE_MODEL_TABLE = {"qwen7B_jaychou_f16": "qwen2.5:7b-instruct", "qwen0.5B_jaychou13": "qwen2.5:0.5b-instruct",
36
  "qwen14B_jaychou_q8_newdata_add_template": "qwen2.5:14b-instruct",
 
1
  from pathlib import Path
2
  import os
3
 
4
+ DEFAULT_MODEL_NAME = "Qwen/Qwen3-30B-A3B"
5
  DEFAULT_MODE = "角色扮演"
6
  DEFAULT_C_SETTING_MODE = "by system"
7
  DEFAULT_COSPLAY_SETTING = 'rag/characters/周杰伦.txt'
8
  AVALIABLE_MODELS = [
9
+ "Qwen/Qwen3-30B-A3B",
10
+ "Qwen/Qwen2.5-32B-Instruct"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
11
  ]
12
  BASE_MODEL_TABLE = {"qwen7B_jaychou_f16": "qwen2.5:7b-instruct", "qwen0.5B_jaychou13": "qwen2.5:0.5b-instruct",
13
  "qwen14B_jaychou_q8_newdata_add_template": "qwen2.5:14b-instruct",