lemonteaa committed
Commit 19d6e32 · verified · 1 Parent(s): f7ba720

Update chat_demo.py

Files changed (1): chat_demo.py (+14 -5)
chat_demo.py CHANGED
@@ -7,8 +7,15 @@ import tempfile
 import subprocess
 import threading
 
-BASE_URL = "http://localhost:5100/v1"
-MODEL_NAME = "placeholder-model-id"
+MAIN_PORT = 5100
+BASE_URL = f"http://localhost:{MAIN_PORT}/v1"
+#MODEL_NAME = "placeholder-model-id"
+MODEL_LIST = [
+    ("Ernie-4.5-0.3B - Good generalist and small", "Ernie-4.5-0.3B"),
+    ("LFM2-VL-450M - Stronger RLHF? Weaker in STEM", "LFM2-VL-450M"),
+    ("gemma-3-270m-it - Deliberately Raw, need strong system prompt and steering if want assistant behavior", "gemma-3-270m-it"),
+    ("Qwen3-0.6B - hybrid thinking /no_think, can do very limited STEM?", "Qwen3-0.6B")
+]
 
 def read_output(process):
     """Reads the output from the subprocess and prints it to the console."""
@@ -33,12 +40,13 @@ def start_server(command):
 
     return process
 
-server_process = start_server(["./llama.cpp/build/bin/llama-server", "-m" ,"./llama.cpp/build/ERNIE-4.5-0.3B-PT-UD-Q8_K_XL.gguf", "-c", "32000", "--jinja", "--no-mmap", "--port", "5100", "--threads", "2"])
+#server_process = start_server(["./llama.cpp/build/bin/llama-server", "-m" ,"./llama.cpp/build/ERNIE-4.5-0.3B-PT-UD-Q8_K_XL.gguf", "-c", "32000", "--jinja", "--no-mmap", "--port", "5100", "--threads", "2"])
+server_process = start_server(["./llamaswap/llama-swap", "--listen", f"localhost:{MAIN_PORT}", "--config", "./config.yaml"])
 
 
 cli = OpenAI(api_key="sk-nokey", base_url=BASE_URL)
 
-def openai_call(message, history, system_prompt, max_new_tokens):
+def openai_call(message, history, model_chosen, system_prompt, max_new_tokens):
     #print(history) # DEBUG
     history.insert(0, {
         "role": "system",
@@ -49,7 +57,7 @@ def openai_call(message, history, system_prompt, max_new_tokens):
         "content": message
     })
     response = cli.chat.completions.create(
-        model=MODEL_NAME,
+        model=model_chosen,
        messages=history,
         max_tokens=max_new_tokens,
         #stop=["<|im_end|>", "</s>"],
@@ -95,6 +103,7 @@ with gr.Blocks() as demo:
     orig_path = gr.State()
     chatbot = gr.Chatbot(placeholder="Have fun with the AI!", editable='all', show_copy_button=True)
     additional_inputs=[
+        gr.Dropdown(choices=MODEL_LIST, label="LLM Model"),
         gr.Textbox("You are a helpful AI assistant.", label="System Prompt"),
         gr.Slider(30, 8192, value=2048, label="Max new tokens"),
     ]
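
For context, a minimal sketch of how the new dropdown value reaches openai_call, assuming the demo wires these components through gr.ChatInterface (that call sits outside the hunks shown above). Gradio passes additional_inputs to the callback positionally, in declaration order, which is why model_chosen appears right after history in the new signature.

    with gr.Blocks() as demo:
        chatbot = gr.Chatbot(placeholder="Have fun with the AI!", editable='all', show_copy_button=True)
        additional_inputs = [
            gr.Dropdown(choices=MODEL_LIST, label="LLM Model"),                    # -> model_chosen
            gr.Textbox("You are a helpful AI assistant.", label="System Prompt"),  # -> system_prompt
            gr.Slider(30, 8192, value=2048, label="Max new tokens"),               # -> max_new_tokens
        ]
        # Assumed wiring: the callback receives (message, history, *additional_inputs).
        gr.ChatInterface(fn=openai_call, chatbot=chatbot, additional_inputs=additional_inputs)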
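
Since llama-swap routes each request by the model id it carries, the second element of every MODEL_LIST tuple presumably has to match a model entry in ./config.yaml (that file is not part of this diff). A quick optional sanity check, assuming the proxy exposes the standard OpenAI-compatible /v1/models listing:

    # Hypothetical startup check, not in the commit: verify dropdown ids against the server.
    served = {m.id for m in cli.models.list().data}
    missing = [model_id for _, model_id in MODEL_LIST if model_id not in served]
    if missing:
        print("Dropdown models missing from llama-swap config:", missing)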