Tim Luka Horstmann committed
Commit 687de1a · 1 Parent(s): e112ae1

Add ram usage endpoint
Files changed (2)
  1. app.py +31 -10
  2. requirements.txt +2 -1
app.py CHANGED
@@ -12,6 +12,7 @@ import logging
 import os
 import faiss
 import asyncio
+import psutil  # Added for RAM tracking
 
 # Set up logging
 logging.basicConfig(level=logging.INFO)
@@ -105,7 +106,6 @@ def retrieve_context(query, top_k=2):
 try:
     with open("cv_text.txt", "r", encoding="utf-8") as f:
         full_cv_text = f.read()
-    # Ensure full_cv_text is a string
     if not isinstance(full_cv_text, str):
         full_cv_text = str(full_cv_text)
     logger.info("CV text loaded successfully")
@@ -129,15 +129,12 @@ async def stream_response(query, history):
         f"CV: {full_cv_text}"
     )
 
-    # Ensure system_prompt is a string and debug its state
     if not isinstance(system_prompt, str):
         system_prompt = str(system_prompt)
     logger.info(f"System prompt type: {type(system_prompt)}, length: {len(system_prompt)}")
 
-    # Combine system prompt, history, and current query
     messages = [{"role": "system", "content": system_prompt}] + history + [{"role": "user", "content": query}]
 
-    # Estimate token counts and truncate history if necessary
     try:
         system_tokens = len(generator.tokenize(system_prompt.encode('utf-8'), add_bos=True, special=True))
         query_tokens = len(generator.tokenize(query.encode('utf-8'), add_bos=False, special=True))
@@ -148,19 +145,16 @@ async def stream_response(query, history):
         yield "data: [DONE]\n\n"
         return
 
-    total_tokens = system_tokens + query_tokens + sum(history_tokens) + len(history) * 10 + 10  # Rough estimate for formatting
-
-    max_allowed_tokens = generator.n_ctx() - 512 - 100  # max_tokens=512, safety_margin=100
+    total_tokens = system_tokens + query_tokens + sum(history_tokens) + len(history) * 10 + 10
+    max_allowed_tokens = generator.n_ctx() - 512 - 100
 
     while total_tokens > max_allowed_tokens and history:
         removed_msg = history.pop(0)
         removed_tokens = len(generator.tokenize(removed_msg["content"].encode('utf-8'), add_bos=False, special=True))
         total_tokens -= (removed_tokens + 10)
 
-    # Reconstruct messages after possible truncation
     messages = [{"role": "system", "content": system_prompt}] + history + [{"role": "user", "content": query}]
 
-    # Generate response with lock
     async with model_lock:
         try:
             for chunk in generator.create_chat_completion(
@@ -187,6 +181,20 @@ class QueryRequest(BaseModel):
     query: str
     history: list[dict]
 
+# RAM Usage Tracking Function
+def get_ram_usage():
+    memory = psutil.virtual_memory()
+    total_ram = memory.total / (1024 ** 3)  # Convert to GB
+    used_ram = memory.used / (1024 ** 3)  # Convert to GB
+    free_ram = memory.available / (1024 ** 3)  # Convert to GB
+    percent_used = memory.percent
+    return {
+        "total_ram_gb": round(total_ram, 2),
+        "used_ram_gb": round(used_ram, 2),
+        "free_ram_gb": round(free_ram, 2),
+        "percent_used": percent_used
+    }
+
 @app.post("/api/predict")
 async def predict(request: QueryRequest):
     query = request.query
@@ -208,6 +216,16 @@ async def model_info():
         "faiss_index_dim": cv_embeddings.shape[1],
     }
 
+@app.get("/ram_usage")
+async def ram_usage():
+    """Endpoint to get current RAM usage."""
+    try:
+        ram_stats = get_ram_usage()
+        return ram_stats
+    except Exception as e:
+        logger.error(f"Error retrieving RAM usage: {str(e)}")
+        raise HTTPException(status_code=500, detail=f"Error retrieving RAM usage: {str(e)}")
+
 @app.on_event("startup")
 async def warm_up_model():
     logger.info("Warming up the model...")
@@ -215,4 +233,7 @@ async def warm_up_model():
     dummy_history = []
     async for _ in stream_response(dummy_query, dummy_history):
         pass
-    logger.info("Model warm-up completed.")
+    logger.info("Model warm-up completed.")
+    # Log initial RAM usage
+    ram_stats = get_ram_usage()
+    logger.info(f"Initial RAM usage after startup: {ram_stats}")
requirements.txt CHANGED
@@ -6,4 +6,5 @@ numpy==1.26.4
 llama-cpp-python==0.3.1
 huggingface_hub==0.30.1
 faiss-cpu==1.8.0
-asyncio
+asyncio
+psutil
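A quick local sanity check of the newly added psutil dependency (a sketch only, not part of the commit; the numbers depend on the machine):

# Standalone check of the psutil call that get_ram_usage() relies on
import psutil

memory = psutil.virtual_memory()
print(round(memory.total / (1024 ** 3), 2), "GB total,", memory.percent, "% used")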