mojaalagevai committed
Commit a745fd4 · verified · 1 Parent(s): 72fa1c4

Update app.py

Files changed (1)
  1. app.py +63 -176
app.py CHANGED
@@ -1,12 +1,7 @@
- # Importing required libraries
- import warnings
- warnings.filterwarnings("ignore")
-
+ from fastapi import FastAPI, HTTPException
+ from pydantic import BaseModel
+ from typing import List, Tuple, Optional
  import os
- import json
- import subprocess
- import sys
- from typing import List, Tuple
  from llama_cpp import Llama
  from llama_cpp_agent import LlamaCppAgent
  from llama_cpp_agent import MessagesFormatterType
@@ -14,10 +9,16 @@ from llama_cpp_agent.providers import LlamaCppPythonProvider
  from llama_cpp_agent.chat_history import BasicChatHistory
  from llama_cpp_agent.chat_history.messages import Roles
  from huggingface_hub import hf_hub_download
- import gradio as gr
+ import logging
+ import sys
  from logger import logging
  from exception import CustomExceptionHandling

+ app = FastAPI(
+     title="Dolphin Llama.cpp API",
+     description="API for interacting with Dolphin3.0 models using Llama.cpp",
+     version="1.0.0"
+ )

  # Download gguf model files
  if not os.path.exists("./models"):
@@ -39,65 +40,33 @@ hf_hub_download(
      local_dir="./models",
  )

-
- # Set the title and description
- title = "Dolphin🐬 Llama.cpp"
- description = """**[Dolphin 3.0](https://huggingface.co/collections/cognitivecomputations/dolphin-30-677ab47f73d7ff66743979a3)** is a powerful, general-purpose local AI model designed for coding, math, and various other tasks, aiming similar to the models like ChatGPT and Claude.
- This interactive chat interface allows you to experiment with the [`Dolphin3.0-Qwen2.5-0.5B`](https://huggingface.co/cognitivecomputations/Dolphin3.0-Qwen2.5-0.5B) and [`Dolphin3.0-Llama3.2-1B`](https://huggingface.co/cognitivecomputations/Dolphin3.0-Llama3.2-1B) text models using various prompts and generation parameters.
- Users can select different model variants (GGUF format), system prompts, and observe generated responses in real-time.
- Key generation parameters, such as `temperature`, `max_tokens`, `top_k` and others are exposed below for tuning model behavior."""
-
-
  llm = None
  llm_model = None

- def respond(
-     message: str,
-     history: List[Tuple[str, str]],
-     model: str = "Dolphin3.0-Qwen2.5-0.5B-Q6_K.gguf", # Set default model
-     system_message: str = "You are a helpful assistant.",
-     max_tokens: int = 1024,
-     temperature: float = 0.7,
-     top_p: float = 0.95,
-     top_k: int = 40,
-     repeat_penalty: float = 1.1,
- ):
-     """
-     Respond to a message using the Dolphin-3 model via Llama.cpp.
-
-     Args:
-         - message (str): The message to respond to.
-         - history (List[Tuple[str, str]]): The chat history.
-         - model (str): The model to use.
-         - system_message (str): The system message to use.
-         - max_tokens (int): The maximum number of tokens to generate.
-         - temperature (float): The temperature of the model.
-         - top_p (float): The top-p of the model.
-         - top_k (int): The top-k of the model.
-         - repeat_penalty (float): The repetition penalty of the model.
-
-     Returns:
-         str: The response to the message.
-     """
+ class ChatRequest(BaseModel):
+     message: str
+     history: List[Tuple[str, str]] = []
+     model: str = "Dolphin3.0-Qwen2.5-0.5B-Q6_K.gguf"
+     system_message: str = "You are Dolphin, a helpful AI assistant focused on accurate and ethical responses."
+     max_tokens: int = 1024
+     temperature: float = 0.7
+     top_p: float = 0.95
+     top_k: int = 40
+     repeat_penalty: float = 1.1
+
+ class ChatResponse(BaseModel):
+     response: str
+
+ def initialize_llm(model: str):
+     global llm, llm_model
      try:
-         # Load the global variables
-         global llm
-         global llm_model
-
-         # Ensure model is not None
-         if model is None:
-             model = "Dolphin3.0-Qwen2.5-0.5B-Q6_K.gguf"
+         model_path = f"models/{model}"
+         if not os.path.exists(model_path):
+             raise HTTPException(status_code=400, detail=f"Model file not found at {model_path}")

-         # Load the model
          if llm is None or llm_model != model:
-             # Check if model file exists
-             model_path = f"models/{model}"
-             if not os.path.exists(model_path):
-                 yield f"Error: Model file not found at {model_path}. Please check your model path."
-                 return
-
              llm = Llama(
-                 model_path=f"models/{model}",
+                 model_path=model_path,
                  flash_attn=False,
                  n_gpu_layers=0,
                  n_batch=8,
@@ -106,140 +75,58 @@ def respond(
                  n_threads_batch=8,
              )
              llm_model = model
+         return llm
+     except Exception as e:
+         raise CustomExceptionHandling(e, sys) from e
+
+ @app.post("/chat", response_model=ChatResponse)
+ async def chat(request: ChatRequest):
+     try:
+         # Initialize LLM
+         llm = initialize_llm(request.model)
          provider = LlamaCppPythonProvider(llm)

-         # Create the agent
+         # Create agent
          agent = LlamaCppAgent(
              provider,
-             system_prompt=f"{system_message}",
+             system_prompt=request.system_message,
              predefined_messages_formatter_type=MessagesFormatterType.CHATML,
              debug_output=True,
          )

-         # Set the settings like temperature, top-k, top-p, max tokens, etc.
+         # Set sampling settings
          settings = provider.get_provider_default_settings()
-         settings.temperature = temperature
-         settings.top_k = top_k
-         settings.top_p = top_p
-         settings.max_tokens = max_tokens
-         settings.repeat_penalty = repeat_penalty
-         settings.stream = True
-
+         settings.temperature = request.temperature
+         settings.top_k = request.top_k
+         settings.top_p = request.top_p
+         settings.max_tokens = request.max_tokens
+         settings.repeat_penalty = request.repeat_penalty
+         settings.stream = False
+
+         # Build chat history
          messages = BasicChatHistory()
+         for user_msg, assistant_msg in request.history:
+             messages.add_message({"role": Roles.user, "content": user_msg})
+             messages.add_message({"role": Roles.assistant, "content": assistant_msg})

-         # Add the chat history
-         for msn in history:
-             user = {"role": Roles.user, "content": msn[0]}
-             assistant = {"role": Roles.assistant, "content": msn[1]}
-             messages.add_message(user)
-             messages.add_message(assistant)
-
-         # Get the response stream
-         stream = agent.get_chat_response(
-             message,
+         # Get response
+         response = agent.get_chat_response(
+             request.message,
              llm_sampling_settings=settings,
              chat_history=messages,
-             returns_streaming_generator=True,
              print_output=False,
          )

-         # Log the success
-         logging.info("Response stream generated successfully")
+         logging.info("Response generated successfully")
+         return ChatResponse(response=response)

-         # Generate the response
-         outputs = ""
-         for output in stream:
-             outputs += output
-             yield outputs
-
-         # Handle exceptions that may occur during the process
      except Exception as e:
-         # Custom exception handling
          raise CustomExceptionHandling(e, sys) from e

+ @app.get("/health")
+ async def health_check():
+     return {"status": "healthy"}

- # Create a chat interface
- demo = gr.ChatInterface(
-     respond,
-     examples=[["What is the capital of France?"], ["Tell me something about artificial intelligence."], ["What is gravity?"]],
-     additional_inputs_accordion=gr.Accordion(
-         label="⚙️ Parameters", open=False, render=False
-     ),
-     additional_inputs=[
-         gr.Dropdown(
-             choices=[
-                 "Dolphin3.0-Llama3.2-1B-Q4_K_M.gguf",
-                 "Dolphin3.0-Qwen2.5-0.5B-Q6_K.gguf",
-                 "Qwen2.5-Coder-14B-Instruct-Q6_K.gguf",
-             ],
-             value="Dolphin3.0-Qwen2.5-0.5B-Q6_K.gguf",
-             label="Model",
-             info="Select the AI model to use for chat",
-         ),
-         gr.Textbox(
-             value="You are Dolphin, a helpful AI assistant focused on accurate and ethical responses.",
-             label="System Prompt",
-             info="Define the AI assistant's personality and behavior",
-             lines=2,
-         ),
-         gr.Slider(
-             minimum=512,
-             maximum=2048,
-             value=1024,
-             step=1,
-             label="Max Tokens",
-             info="Maximum length of response (higher = longer replies)",
-         ),
-         gr.Slider(
-             minimum=0.1,
-             maximum=2.0,
-             value=0.7,
-             step=0.1,
-             label="Temperature",
-             info="Creativity level (higher = more creative, lower = more focused)",
-         ),
-         gr.Slider(
-             minimum=0.1,
-             maximum=1.0,
-             value=0.95,
-             step=0.05,
-             label="Top-p",
-             info="Nucleus sampling threshold",
-         ),
-         gr.Slider(
-             minimum=1,
-             maximum=100,
-             value=40,
-             step=1,
-             label="Top-k",
-             info="Limit vocabulary choices to top K tokens",
-         ),
-         gr.Slider(
-             minimum=1.0,
-             maximum=2.0,
-             value=1.1,
-             step=0.1,
-             label="Repetition Penalty",
-             info="Penalize repeated words (higher = less repetition)",
-         ),
-     ],
-     theme="Ocean",
-     submit_btn="Send",
-     stop_btn="Stop",
-     title=title,
-     description=description,
-     chatbot=gr.Chatbot(scale=1, show_copy_button=True, resizable=True),
-     flagging_mode="never",
-     editable=True,
-     cache_examples=False,
- )
-
-
- # Launch the chat interface
  if __name__ == "__main__":
-     demo.launch(
-         share=True,
-         server_name="0.0.0.0",
-         server_port=7860,
-         show_api=True,
-     )
+     import uvicorn
+     uvicorn.run(app, host="0.0.0.0", port=8000)
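
After this change, app.py no longer serves a Gradio UI; it exposes two FastAPI endpoints instead: POST /chat, whose JSON body mirrors the ChatRequest model and which returns a ChatResponse, and GET /health. The sketch below is one minimal way to exercise the new API once the server is running via the committed __main__ block (uvicorn on 0.0.0.0:8000); the use of the requests package and the localhost base URL are assumptions for illustration, not part of the commit.

# Minimal client sketch (assumes the server was started with the committed
# __main__ block, i.e. uvicorn on port 8000; `requests` is a third-party
# package that this commit does not install).
import requests

BASE_URL = "http://localhost:8000"  # assumed local deployment

# GET /health — liveness probe added by the commit.
print(requests.get(f"{BASE_URL}/health").json())  # expected: {'status': 'healthy'}

# POST /chat — body mirrors the ChatRequest Pydantic model; omitted fields
# fall back to the defaults defined in the model (model file, temperature, ...).
payload = {
    "message": "What is the capital of France?",
    "history": [["Hi", "Hello! How can I help you?"]],  # prior (user, assistant) turns
    "max_tokens": 256,
}
resp = requests.post(f"{BASE_URL}/chat", json=payload, timeout=300)
resp.raise_for_status()
print(resp.json()["response"])

Note that the new endpoint returns the whole completion in a single JSON payload (the commit sets settings.stream = False), whereas the removed Gradio handler streamed partial outputs as they were generated.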