mojaalagevai committed on
Commit 7d65968 · verified · 1 Parent(s): c526306

Update app.py

Files changed (1):
app.py +101 -73
app.py CHANGED
@@ -1,110 +1,137 @@
+import os
+import sys
+from typing import List, Tuple, Optional
+
 from fastapi import FastAPI, HTTPException
 from pydantic import BaseModel
-from typing import List, Tuple, Optional
-import os
+from huggingface_hub import hf_hub_download
 from llama_cpp import Llama
-from llama_cpp_agent import LlamaCppAgent
-from llama_cpp_agent import MessagesFormatterType
+from llama_cpp_agent import LlamaCppAgent, MessagesFormatterType
 from llama_cpp_agent.providers import LlamaCppPythonProvider
 from llama_cpp_agent.chat_history import BasicChatHistory
 from llama_cpp_agent.chat_history.messages import Roles
-from huggingface_hub import hf_hub_download
-import logging
-import sys
-from logger import logging
-from exception import CustomExceptionHandling
-
-app = FastAPI(
-    title="Dolphin Llama.cpp API",
-    description="API for interacting with Dolphin3.0 models using Llama.cpp",
-    version="1.0.0"
-)
-
-# Download gguf model files
-if not os.path.exists("./models"):
-    os.makedirs("./models")
-
-hf_hub_download(
-    repo_id="bartowski/Dolphin3.0-Llama3.2-1B-GGUF",
-    filename="Dolphin3.0-Llama3.2-1B-Q4_K_M.gguf",
-    local_dir="./models",
-)
-hf_hub_download(
-    repo_id="bartowski/Dolphin3.0-Qwen2.5-0.5B-GGUF",
-    filename="Dolphin3.0-Qwen2.5-0.5B-Q6_K.gguf",
-    local_dir="./models",
-)
-hf_hub_download(
-    repo_id="bartowski/Qwen2.5-Coder-14B-Instruct-GGUF",
-    filename="Qwen2.5-Coder-14B-Instruct-Q6_K.gguf",
-    local_dir="./models",
-)
 
+# Suppress warnings
+import warnings
+warnings.filterwarnings("ignore")
+
+# Ensure models directory exists
+MODEL_DIR = "./models"
+os.makedirs(MODEL_DIR, exist_ok=True)
+
+# Model info for download
+MODELS_INFO = [
+    {
+        "repo_id": "bartowski/Dolphin3.0-Llama3.2-1B-GGUF",
+        "filename": "Dolphin3.0-Llama3.2-1B-Q4_K_M.gguf"
+    },
+    {
+        "repo_id": "bartowski/Dolphin3.0-Qwen2.5-0.5B-GGUF",
+        "filename": "Dolphin3.0-Qwen2.5-0.5B-Q6_K.gguf"
+    },
+    {
+        "repo_id": "bartowski/Qwen2.5-Coder-14B-Instruct-GGUF",
+        "filename": "Qwen2.5-Coder-14B-Instruct-Q6_K.gguf"
+    }
+]
+
+# Download all models if not present
+for model_info in MODELS_INFO:
+    model_path = os.path.join(MODEL_DIR, model_info["filename"])
+    if not os.path.exists(model_path):
+        print(f"Downloading {model_info['filename']} from {model_info['repo_id']}...")
+        try:
+            hf_hub_download(
+                repo_id=model_info["repo_id"],
+                filename=model_info["filename"],
+                local_dir=MODEL_DIR
+            )
+            print(f"Downloaded {model_info['filename']}")
+        except Exception as e:
+            print(f"Error downloading {model_info['filename']}: {e}")
+
+# Available model keys
+AVAILABLE_MODELS = {
+    "llama": "Dolphin3.0-Llama3.2-1B-Q4_K_M.gguf",
+    "qwen": "Dolphin3.0-Qwen2.5-0.5B-Q6_K.gguf",
+    "coder": "Qwen2.5-Coder-14B-Instruct-Q6_K.gguf"
+}
+
+# Global LLM instance
 llm = None
 llm_model = None
 
+def load_model(model_key: str):
+    global llm, llm_model
+    model_name = AVAILABLE_MODELS.get(model_key)
+    if not model_name:
+        raise ValueError(f"Invalid model key: {model_key}")
+
+    model_path = os.path.join(MODEL_DIR, model_name)
+    if not os.path.exists(model_path):
+        raise FileNotFoundError(f"Model file not found at {model_path}")
+
+    if llm is None or llm_model != model_name:
+        llm = Llama(
+            model_path=model_path,
+            flash_attn=False,
+            n_gpu_layers=0,
+            n_batch=8,
+            n_ctx=2048,
+            n_threads=8,
+            n_threads_batch=8,
+        )
+        llm_model = model_name
+    return llm
+
+
 class ChatRequest(BaseModel):
     message: str
     history: List[Tuple[str, str]] = []
-    model: str = "Dolphin3.0-Qwen2.5-0.5B-Q6_K.gguf"
-    system_message: str = "You are Dolphin, a helpful AI assistant focused on accurate and ethical responses."
+    model: str = "qwen"
+    system_prompt: str = "You are Dolphin, a helpful AI assistant."
     max_tokens: int = 1024
     temperature: float = 0.7
     top_p: float = 0.95
     top_k: int = 40
     repeat_penalty: float = 1.1
 
+
 class ChatResponse(BaseModel):
     response: str
 
-def initialize_llm(model: str):
-    global llm, llm_model
-    try:
-        model_path = f"models/{model}"
-        if not os.path.exists(model_path):
-            raise HTTPException(status_code=400, detail=f"Model file not found at {model_path}")
-
-        if llm is None or llm_model != model:
-            llm = Llama(
-                model_path=model_path,
-                flash_attn=False,
-                n_gpu_layers=0,
-                n_batch=8,
-                n_ctx=2048,
-                n_threads=8,
-                n_threads_batch=8,
-            )
-            llm_model = model
-        return llm
-    except Exception as e:
-        raise CustomExceptionHandling(e, sys) from e
+
+app = FastAPI(
+    title="Dolphin 3.0 LLM API",
+    description="REST API for Dolphin 3.0 models using Llama.cpp backend.",
+    version="1.0"
+)
+
 
 @app.post("/chat", response_model=ChatResponse)
-async def chat(request: ChatRequest):
+def chat(request: ChatRequest):
     try:
-        # Initialize LLM
-        llm = initialize_llm(request.model)
+        # Load model
+        load_model(request.model)
+
         provider = LlamaCppPythonProvider(llm)
 
-        # Create agent
         agent = LlamaCppAgent(
             provider,
-            system_prompt=request.system_message,
+            system_prompt=request.system_prompt,
             predefined_messages_formatter_type=MessagesFormatterType.CHATML,
-            debug_output=True,
         )
 
-        # Set sampling settings
        settings = provider.get_provider_default_settings()
        settings.temperature = request.temperature
        settings.top_k = request.top_k
        settings.top_p = request.top_p
        settings.max_tokens = request.max_tokens
        settings.repeat_penalty = request.repeat_penalty
-        settings.stream = False
 
-        # Build chat history
         messages = BasicChatHistory()
+
+        # Add history
         for user_msg, assistant_msg in request.history:
             messages.add_message({"role": Roles.user, "content": user_msg})
             messages.add_message({"role": Roles.assistant, "content": assistant_msg})
@@ -117,15 +144,16 @@ async def chat(request: ChatRequest):
             print_output=False,
         )
 
-        logging.info("Response generated successfully")
-        return ChatResponse(response=response)
+        return {"response": response}
 
     except Exception as e:
-        raise CustomExceptionHandling(e, sys) from e
+        raise HTTPException(status_code=500, detail=str(e))
+
+
+@app.get("/")
+def read_root():
+    return {"message": "Welcome to Dolphin 3.0 FastAPI LLM Server!"}
 
-@app.get("/health")
-async def health_check():
-    return {"status": "healthy"}
 
 if __name__ == "__main__":
     import uvicorn
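
For reference, a minimal client sketch against the API as changed by this commit. It is an assumption-laden illustration, not part of app.py: the uvicorn.run call is outside the diff context, so the BASE_URL host and port below are placeholders, and the requests library is an extra dependency of the example only. It exercises the new "/" root endpoint and the updated /chat schema (short model keys "llama" / "qwen" / "coder" and the system_prompt field that replaces system_message).

# Hypothetical client for the updated endpoints; adjust BASE_URL to match
# however uvicorn is actually started (not shown in this diff).
import requests

BASE_URL = "http://127.0.0.1:8000"  # placeholder host/port (assumption)

# Root endpoint added in this commit
print(requests.get(f"{BASE_URL}/").json())

# /chat now takes a short model key and a `system_prompt` field
payload = {
    "message": "Explain what a GGUF file is in one sentence.",
    "history": [],
    "model": "qwen",
    "system_prompt": "You are Dolphin, a helpful AI assistant.",
    "max_tokens": 256,
    "temperature": 0.7,
}
resp = requests.post(f"{BASE_URL}/chat", json=payload, timeout=300)
resp.raise_for_status()
print(resp.json()["response"])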