Tim Luka Horstmann committed
Commit 58d2235 · Parent: dc475e9
increased batch size again

Files changed:
- app.py: +30 -27
- requirements.txt: +2 -1
app.py
CHANGED
@@ -1,5 +1,3 @@
-# app.py
-
 from datetime import datetime
 import json
 import time
@@ -13,6 +11,7 @@ from huggingface_hub import login, hf_hub_download
 import logging
 import os
 import faiss
+import asyncio

 # Set up logging
 logging.basicConfig(level=logging.INFO)
@@ -20,6 +19,9 @@ logger = logging.getLogger(__name__)

 app = FastAPI()

+# Global lock for model access
+model_lock = asyncio.Lock()
+
 # Authenticate with Hugging Face
 hf_token = os.getenv("HF_TOKEN")
 if not hf_token:
@@ -29,11 +31,11 @@ login(token=hf_token)

 # Models Configuration
 sentence_transformer_model = "all-MiniLM-L6-v2"
-#
+# Using the 8B model with Q4_K_M quantization
 repo_id = "bartowski/deepcogito_cogito-v1-preview-llama-8B-GGUF"
-filename = "deepcogito_cogito-v1-preview-llama-8B-Q4_K_M.gguf"
+filename = "deepcogito_cogito-v1-preview-llama-8B-Q4_K_M.gguf"

-# Define FAQs
+# Define FAQs
 faqs = [
     {"question": "What is your name?", "answer": "My name is Tim Luka Horstmann."},
     {"question": "Where do you live?", "answer": "I live in Paris, France."},
@@ -45,7 +47,7 @@ faqs = [
 ]

 try:
-    # Load CV embeddings and build FAISS index
+    # Load CV embeddings and build FAISS index
     logger.info("Loading CV embeddings from cv_embeddings.json")
     with open("cv_embeddings.json", "r", encoding="utf-8") as f:
         cv_data = json.load(f)
@@ -74,12 +76,12 @@ try:
         local_dir="/app/cache" if os.getenv("HF_HOME") else None,
         token=hf_token,
     )
-    #
+    # Use n_batch=256 for lower first-token latency on CPU
     generator = Llama(
         model_path=model_path,
         n_ctx=2048,
         n_threads=2,
-        n_batch=
+        n_batch=256, # Reduced from 512 to improve streaming responsiveness
         n_gpu_layers=0,
         verbose=True,
     )
@@ -104,7 +106,7 @@ def retrieve_context(query, top_k=2):
 with open("cv_text.txt", "r", encoding="utf-8") as f:
     full_cv_text = f.read()

-def stream_response(query):
+async def stream_response(query):
     logger.info(f"Processing query: {query}")
     start_time = time.time()
     first_token_logged = False
@@ -139,21 +141,22 @@ def stream_response(query):
         {"role": "user", "content": query}
     ]

-    #
-
-
-
-
-
-
-
-
-
-
-    if
-
-
-
+    # Acquire lock to ensure exclusive model access
+    async with model_lock:
+        for chunk in generator.create_chat_completion(
+            messages=messages,
+            max_tokens=512,
+            stream=True,
+            temperature=0.3,
+            top_p=0.7,
+            repeat_penalty=1.2
+        ):
+            token = chunk['choices'][0]['delta'].get('content', '')
+            if token:
+                if not first_token_logged:
+                    logger.info(f"First token time: {time.time() - start_time:.2f}s")
+                    first_token_logged = True
+                yield f"data: {token}\n\n"
     yield "data: [DONE]\n\n"

 class QueryRequest(BaseModel):
@@ -181,10 +184,10 @@ async def model_info():
         "faiss_index_dim": cv_embeddings.shape[1],
     }

-# Use a smaller warm-up query to prime the model without extensive delay.
 @app.on_event("startup")
 async def warm_up_model():
     logger.info("Warming up the model...")
     dummy_query = "Hello"
-
-
+    async for _ in stream_response(dummy_query):
+        pass
+    logger.info("Model warm-up completed.")
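The main behavioural change in app.py is that stream_response becomes an async generator and every call into the llama.cpp generator is serialized behind a single asyncio.Lock, so concurrent requests can no longer interleave token generation. Below is a minimal, self-contained sketch of that pattern (not part of the commit); fake_completion is a stand-in for the blocking generator.create_chat_completion(..., stream=True) call.

# Sketch of the locking pattern introduced above (stubbed model, runnable as-is).
import asyncio

model_lock = asyncio.Lock()

def fake_completion(prompt):
    # Stand-in for the blocking llama.cpp streaming call.
    for token in (prompt + " !").split():
        yield token

async def stream_response(query):
    async with model_lock:  # exclusive access to the (stubbed) model
        for token in fake_completion(query):
            yield f"data: {token}\n\n"
    yield "data: [DONE]\n\n"

async def main():
    async def consume(tag, query):
        async for chunk in stream_response(query):
            print(tag, chunk.strip())
    # Both consumers start together; the lock makes request B wait until A has finished streaming.
    await asyncio.gather(consume("A", "hello there"), consume("B", "general kenobi"))

asyncio.run(main())

Note that create_chat_completion itself still runs synchronously inside the async generator, so the event loop is blocked while each chunk is produced; the lock only prevents two requests from hitting the model at the same time.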
requirements.txt
CHANGED
@@ -5,4 +5,5 @@ torch==2.4.1
 numpy==1.26.4
 llama-cpp-python==0.3.1
 huggingface_hub==0.30.1
-faiss-cpu==1.8.0
+faiss-cpu==1.8.0
+asyncio
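requirements.txt re-pins faiss-cpu 1.8.0 and adds asyncio; since asyncio has shipped with the Python standard library since 3.4, that last line is likely redundant (the PyPI package of the same name is an outdated snapshot). As a quick, hypothetical smoke test for the FAISS dependency, assuming 384-dimensional vectors (the size produced by all-MiniLM-L6-v2) and a flat L2 index chosen only for the example, since the actual index type is not shown in this diff:

# Hypothetical smoke test (not part of the commit): check the pinned faiss-cpu
# build can index and search vectors of the embedding size used in app.py.
import numpy as np
import faiss

dim = 384  # all-MiniLM-L6-v2 produces 384-dimensional embeddings
index = faiss.IndexFlatL2(dim)
vectors = np.random.rand(10, dim).astype("float32")
index.add(vectors)
distances, ids = index.search(vectors[:1], 2)
print(index.ntotal, ids[0])  # 10 indexed vectors; ids of the 2 nearest neighbours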