Tim Luka Horstmann committed on
Commit 58d2235 · 1 Parent(s): dc475e9

increased batch size again

Files changed (2)
  1. app.py +30 -27
  2. requirements.txt +2 -1
app.py CHANGED
@@ -1,5 +1,3 @@
-# app.py
-
 from datetime import datetime
 import json
 import time
@@ -13,6 +11,7 @@ from huggingface_hub import login, hf_hub_download
 import logging
 import os
 import faiss
+import asyncio
 
 # Set up logging
 logging.basicConfig(level=logging.INFO)
@@ -20,6 +19,9 @@ logger = logging.getLogger(__name__)
 
 app = FastAPI()
 
+# Global lock for model access
+model_lock = asyncio.Lock()
+
 # Authenticate with Hugging Face
 hf_token = os.getenv("HF_TOKEN")
 if not hf_token:
@@ -29,11 +31,11 @@ login(token=hf_token)
 
 # Models Configuration
 sentence_transformer_model = "all-MiniLM-L6-v2"
-# Upgrade to the 8B model and choose Q4_0 quantization for a good balance of performance and resource usage.
+# Using the 8B model with Q4_K_M quantization
 repo_id = "bartowski/deepcogito_cogito-v1-preview-llama-8B-GGUF"
-filename = "deepcogito_cogito-v1-preview-llama-8B-Q4_K_M.gguf"  # New 8B model with Q4_0 quantization
+filename = "deepcogito_cogito-v1-preview-llama-8B-Q4_K_M.gguf"
 
-# Define FAQs (unchanged)
+# Define FAQs
 faqs = [
     {"question": "What is your name?", "answer": "My name is Tim Luka Horstmann."},
     {"question": "Where do you live?", "answer": "I live in Paris, France."},
@@ -45,7 +47,7 @@ faqs = [
 ]
 
 try:
-    # Load CV embeddings and build FAISS index (unchanged)
+    # Load CV embeddings and build FAISS index
     logger.info("Loading CV embeddings from cv_embeddings.json")
     with open("cv_embeddings.json", "r", encoding="utf-8") as f:
         cv_data = json.load(f)
@@ -74,12 +76,12 @@ try:
         local_dir="/app/cache" if os.getenv("HF_HOME") else None,
         token=hf_token,
     )
-    # Lower n_batch for more frequent token streaming.
+    # Use n_batch=256 for lower first-token latency on CPU
     generator = Llama(
         model_path=model_path,
         n_ctx=2048,
         n_threads=2,
-        n_batch=128,  # Adjusted for lower latency on streaming responses
+        n_batch=256,  # Increased from 128; larger batches speed up prompt processing
         n_gpu_layers=0,
         verbose=True,
     )
@@ -104,7 +106,7 @@ def retrieve_context(query, top_k=2):
 with open("cv_text.txt", "r", encoding="utf-8") as f:
     full_cv_text = f.read()
 
-def stream_response(query):
+async def stream_response(query):
     logger.info(f"Processing query: {query}")
     start_time = time.time()
     first_token_logged = False
@@ -139,21 +141,22 @@ def stream_response(query):
         {"role": "user", "content": query}
     ]
 
-    # Stream tokens immediately as they are generated, avoiding additional buffering.
-    for chunk in generator.create_chat_completion(
-        messages=messages,
-        max_tokens=512,
-        stream=True,
-        temperature=0.3,
-        top_p=0.7,
-        repeat_penalty=1.2
-    ):
-        token = chunk['choices'][0]['delta'].get('content', '')
-        if token:
-            if not first_token_logged:
-                logger.info(f"First token time: {time.time() - start_time:.2f}s")
-                first_token_logged = True
-            yield f"data: {token}\n\n"
+    # Acquire lock to ensure exclusive model access
+    async with model_lock:
+        for chunk in generator.create_chat_completion(
+            messages=messages,
+            max_tokens=512,
+            stream=True,
+            temperature=0.3,
+            top_p=0.7,
+            repeat_penalty=1.2
+        ):
+            token = chunk['choices'][0]['delta'].get('content', '')
+            if token:
+                if not first_token_logged:
+                    logger.info(f"First token time: {time.time() - start_time:.2f}s")
+                    first_token_logged = True
+                yield f"data: {token}\n\n"
     yield "data: [DONE]\n\n"
 
 class QueryRequest(BaseModel):
@@ -181,10 +184,10 @@ async def model_info():
         "faiss_index_dim": cv_embeddings.shape[1],
     }
 
-# Use a smaller warm-up query to prime the model without extensive delay.
 @app.on_event("startup")
 async def warm_up_model():
     logger.info("Warming up the model...")
     dummy_query = "Hello"
-    next(stream_response(dummy_query))
-    logger.info("Model warm-up initiated.")
+    async for _ in stream_response(dummy_query):
+        pass
+    logger.info("Model warm-up completed.")
requirements.txt CHANGED
@@ -5,4 +5,5 @@ torch==2.4.1
 numpy==1.26.4
 llama-cpp-python==0.3.1
 huggingface_hub==0.30.1
-faiss-cpu==1.8.0
+faiss-cpu==1.8.0
+asyncio
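One note on the requirements change: asyncio has been part of the Python standard library since 3.4, so the new asyncio line resolves to an outdated PyPI stub rather than the module app.py actually imports, and is unnecessary on any Python 3 runtime. For completeness, here is a hypothetical client for the SSE stream the endpoint produces; the URL and route are assumptions, and requests is pulled in transitively by huggingface_hub.

import requests  # transitive dependency via huggingface_hub

resp = requests.post(
    "http://localhost:8000/query",  # hypothetical URL and route
    json={"query": "Where do you live?"},
    stream=True,
)
for line in resp.iter_lines(decode_unicode=True):
    if not line or not line.startswith("data: "):
        continue  # skip the blank lines that separate SSE events
    payload = line[len("data: "):]
    if payload == "[DONE]":
        break
    print(payload, end="", flush=True)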