Tim Luka Horstmann committed
Commit 687de1a · Parent(s): e112ae1

Add ram usage endpoint

Files changed:
- app.py (+31 -10)
- requirements.txt (+2 -1)
app.py  CHANGED

@@ -12,6 +12,7 @@ import logging
 import os
 import faiss
 import asyncio
+import psutil  # Added for RAM tracking
 
 # Set up logging
 logging.basicConfig(level=logging.INFO)
@@ -105,7 +106,6 @@ def retrieve_context(query, top_k=2):
 try:
     with open("cv_text.txt", "r", encoding="utf-8") as f:
         full_cv_text = f.read()
-    # Ensure full_cv_text is a string
     if not isinstance(full_cv_text, str):
         full_cv_text = str(full_cv_text)
     logger.info("CV text loaded successfully")
@@ -129,15 +129,12 @@ async def stream_response(query, history):
         f"CV: {full_cv_text}"
     )
 
-    # Ensure system_prompt is a string and debug its state
     if not isinstance(system_prompt, str):
         system_prompt = str(system_prompt)
     logger.info(f"System prompt type: {type(system_prompt)}, length: {len(system_prompt)}")
 
-    # Combine system prompt, history, and current query
     messages = [{"role": "system", "content": system_prompt}] + history + [{"role": "user", "content": query}]
 
-    # Estimate token counts and truncate history if necessary
     try:
         system_tokens = len(generator.tokenize(system_prompt.encode('utf-8'), add_bos=True, special=True))
         query_tokens = len(generator.tokenize(query.encode('utf-8'), add_bos=False, special=True))
@@ -148,19 +145,16 @@ async def stream_response(query, history):
         yield "data: [DONE]\n\n"
         return
 
-    total_tokens = system_tokens + query_tokens + sum(history_tokens) + len(history) * 10 + 10
-
-    max_allowed_tokens = generator.n_ctx() - 512 - 100  # max_tokens=512, safety_margin=100
+    total_tokens = system_tokens + query_tokens + sum(history_tokens) + len(history) * 10 + 10
+    max_allowed_tokens = generator.n_ctx() - 512 - 100
 
     while total_tokens > max_allowed_tokens and history:
         removed_msg = history.pop(0)
         removed_tokens = len(generator.tokenize(removed_msg["content"].encode('utf-8'), add_bos=False, special=True))
         total_tokens -= (removed_tokens + 10)
 
-    # Reconstruct messages after possible truncation
     messages = [{"role": "system", "content": system_prompt}] + history + [{"role": "user", "content": query}]
 
-    # Generate response with lock
     async with model_lock:
         try:
             for chunk in generator.create_chat_completion(
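To make the truncation hunk above easier to follow: each history message is budgeted at its token count plus roughly 10 tokens of chat-format overhead, and 512 tokens (max_tokens) plus a 100-token safety margin are reserved for the reply. A standalone sketch of the same arithmetic, with a hypothetical 4096-token context and made-up message sizes (the real code derives these from generator.tokenize() and generator.n_ctx()):

# Illustrative numbers only; not part of the commit.
n_ctx = 4096                        # hypothetical context window size
system_tokens, query_tokens = 1800, 40
history_tokens = [600, 500, 700]    # made-up per-message token counts

total_tokens = system_tokens + query_tokens + sum(history_tokens) + len(history_tokens) * 10 + 10
max_allowed_tokens = n_ctx - 512 - 100   # reserve 512 for the reply plus a 100-token safety margin

while total_tokens > max_allowed_tokens and history_tokens:
    total_tokens -= history_tokens.pop(0) + 10   # drop the oldest message first

print(total_tokens, max_allowed_tokens)   # 3070 3484: the oldest message was dropped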
@@ -187,6 +181,20 @@ class QueryRequest(BaseModel):
     query: str
     history: list[dict]
 
+# RAM Usage Tracking Function
+def get_ram_usage():
+    memory = psutil.virtual_memory()
+    total_ram = memory.total / (1024 ** 3)  # Convert to GB
+    used_ram = memory.used / (1024 ** 3)  # Convert to GB
+    free_ram = memory.available / (1024 ** 3)  # Convert to GB
+    percent_used = memory.percent
+    return {
+        "total_ram_gb": round(total_ram, 2),
+        "used_ram_gb": round(used_ram, 2),
+        "free_ram_gb": round(free_ram, 2),
+        "percent_used": percent_used
+    }
+
 @app.post("/api/predict")
 async def predict(request: QueryRequest):
     query = request.query
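The new get_ram_usage() helper relies on psutil.virtual_memory(), which reports total, used, available and percent fields (all in bytes except percent). A minimal standalone check of the same conversion, outside the app:

import psutil

# Sanity check of the fields and GB conversion used by get_ram_usage().
mem = psutil.virtual_memory()
print(round(mem.total / (1024 ** 3), 2), "GB total")
print(round(mem.used / (1024 ** 3), 2), "GB used")
print(round(mem.available / (1024 ** 3), 2), "GB available")
print(mem.percent, "% used")

Note that the helper exposes memory.available as free_ram_gb, which psutil generally recommends over memory.free as the practically usable figure.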
@@ -208,6 +216,16 @@ async def model_info():
         "faiss_index_dim": cv_embeddings.shape[1],
     }
 
+@app.get("/ram_usage")
+async def ram_usage():
+    """Endpoint to get current RAM usage."""
+    try:
+        ram_stats = get_ram_usage()
+        return ram_stats
+    except Exception as e:
+        logger.error(f"Error retrieving RAM usage: {str(e)}")
+        raise HTTPException(status_code=500, detail=f"Error retrieving RAM usage: {str(e)}")
+
 @app.on_event("startup")
 async def warm_up_model():
     logger.info("Warming up the model...")
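Once the app is running, the new route answers plain GET requests. A minimal client sketch, assuming the FastAPI app is served locally on port 8000 (host and port are assumptions, not part of this commit):

import requests  # client-side only; not a dependency of the app itself

# Host and port are assumptions; adjust to wherever the app is deployed.
resp = requests.get("http://localhost:8000/ram_usage", timeout=5)
resp.raise_for_status()
print(resp.json())
# Expected shape, per get_ram_usage():
# {"total_ram_gb": ..., "used_ram_gb": ..., "free_ram_gb": ..., "percent_used": ...}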
@@ -215,4 +233,7 @@ async def warm_up_model():
     dummy_history = []
     async for _ in stream_response(dummy_query, dummy_history):
         pass
-    logger.info("Model warm-up completed.")
+    logger.info("Model warm-up completed.")
+    # Log initial RAM usage
+    ram_stats = get_ram_usage()
+    logger.info(f"Initial RAM usage after startup: {ram_stats}")
requirements.txt  CHANGED

@@ -6,4 +6,5 @@ numpy==1.26.4
 llama-cpp-python==0.3.1
 huggingface_hub==0.30.1
 faiss-cpu==1.8.0
-asyncio
+asyncio
+psutil
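psutil is the only new runtime dependency and is left unpinned here. A quick post-install check that it resolves (the version printed is whatever pip selects):

import psutil

# Confirm the new dependency is importable and show the resolved version.
print(psutil.__version__)
print(psutil.virtual_memory())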