Tim Luka Horstmann
committed
Commit · a29c4ff
1 parent: 83ec808
Switched to Llama-3.2-1B Q4_K, added impersonation, optimized performance
app.py CHANGED
@@ -19,31 +19,33 @@ app = FastAPI()
 # Authenticate with Hugging Face
 hf_token = os.getenv("HF_TOKEN")
 if not hf_token:
-    logger.error("HF_TOKEN environment variable not set.
+    logger.error("HF_TOKEN environment variable not set.")
     raise ValueError("HF_TOKEN not set")
 login(token=hf_token)
 
+# Models
+sentence_transformer_model = "all-MiniLM-L6-v2"
+repo_id = "bartowski/Llama-3.2-1B-Instruct-GGUF"
+filename = "Llama-3.2-1B-Instruct-Q4_K_M.gguf"  # Q4_K for speed
+
 try:
-    # Load
+    # Load CV embeddings and build FAISS index
     logger.info("Loading CV embeddings from cv_embeddings.json")
     with open("cv_embeddings.json", "r", encoding="utf-8") as f:
         cv_data = json.load(f)
     cv_chunks = [item["chunk"] for item in cv_data]
     cv_embeddings = np.array([item["embedding"] for item in cv_data]).astype('float32')
-    faiss.normalize_L2(cv_embeddings)
-    faiss_index = faiss.IndexFlatIP(cv_embeddings.shape[1])
+    faiss.normalize_L2(cv_embeddings)
+    faiss_index = faiss.IndexFlatIP(cv_embeddings.shape[1])
     faiss_index.add(cv_embeddings)
     logger.info("FAISS index built successfully")
-    logger.info("CV embeddings loaded successfully")
 
     # Load embedding model
     logger.info("Loading SentenceTransformer model")
-    embedder = SentenceTransformer(
+    embedder = SentenceTransformer(sentence_transformer_model, device="cpu")
     logger.info("SentenceTransformer model loaded")
 
-    # Load
-    repo_id = "bartowski/Llama-3.2-3B-Instruct-GGUF"  # "bartowski/Qwen2.5-7B-Instruct-GGUF"
-    filename = "Llama-3.2-3B-Instruct-Q6_K_L.gguf"  # "Qwen2.5-7B-Instruct-Q4_K_M.gguf"
+    # Load Llama model
     logger.info(f"Loading {filename} model")
     model_path = hf_hub_download(
         repo_id=repo_id,
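
Note on this hunk: faiss.normalize_L2 followed by an IndexFlatIP makes the inner-product search equivalent to cosine similarity over the CV chunks. A minimal sketch of the same scheme on toy vectors (the values are made up; only the FAISS calls mirror the code above):

import faiss
import numpy as np

# Toy stand-ins for the CV chunk embeddings: 4 vectors of dimension 8.
vecs = np.random.rand(4, 8).astype("float32")
faiss.normalize_L2(vecs)                  # scale each row to unit length, in place
index = faiss.IndexFlatIP(vecs.shape[1])  # inner product == cosine on unit vectors
index.add(vecs)

query = np.random.rand(1, 8).astype("float32")
faiss.normalize_L2(query)
scores, ids = index.search(query, 3)      # top-3 chunks by cosine score
print(scores[0], ids[0])
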
@@ -54,11 +56,11 @@ try:
 
     generator = Llama(
         model_path=model_path,
-        n_ctx=
-        n_threads=
-        n_batch=512,
-        n_gpu_layers=0,
-        verbose=True,
+        n_ctx=1024,  # Reduced for speed
+        n_threads=2,  # Match HF Spaces vCPUs
+        n_batch=512,
+        n_gpu_layers=0,
+        verbose=True,
     )
     logger.info(f"{filename} model loaded")
 
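
The new n_ctx=1024 / n_threads=2 values trade context length for latency on a small CPU Space. A hypothetical smoke test for these settings, reusing the generator built above (the prompt text is illustrative):

# Deterministic one-shot completion; verifies the GGUF loads and responds.
out = generator("Q: What is the capital of France?\nA:", max_tokens=8, temperature=0.0)
print(out["choices"][0]["text"])
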
@@ -68,7 +70,6 @@ except Exception as e:
 
 def retrieve_context(query, top_k=3):
     try:
-        # Encode query and normalize for FAISS
         query_embedding = embedder.encode(query, convert_to_numpy=True).astype("float32")
         query_embedding = query_embedding.reshape(1, -1)
         faiss.normalize_L2(query_embedding)
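
This hunk ends mid-function. The usual continuation of the pattern, sketched under the assumption that retrieve_context searches the index and joins the top-k chunks (the body below is a guess consistent with the names above, not the committed code):

        # Search the index and stitch the best-matching CV chunks together.
        distances, indices = faiss_index.search(query_embedding, top_k)
        return "\n".join(cv_chunks[i] for i in indices[0])
    except Exception as e:
        logger.error(f"Error in retrieve_context: {e}")
        raise
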
@@ -83,8 +84,9 @@ def stream_response(query):
     logger.info(f"Processing query: {query}")
     context = retrieve_context(query)
     prompt = (
-        f"<|im_start|>system\
-        f"<|
+        f"<|im_start|>system\nI am Tim Luka Horstmann, a German Computer Scientist. This is my CV:\n{context}\n"
+        f"I will answer your questions about my CV as myself. Please ask me anything!\n<|im_end|>\n"
+        f"<|im_start|>user\n{query}\n<|im_end|>\n"
         f"<|im_start|>assistant\n"
     )
 
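
Note: the <|im_start|>/<|im_end|> tags are the ChatML convention (matching the Qwen models referenced in the removed comments), while Llama-3.2-Instruct's native template uses <|start_header_id|>/<|eot_id|> markers. If the hand-built template ever misbehaves, llama-cpp-python can instead apply the chat template embedded in the GGUF; a hedged alternative sketch (assumes the quant ships a template, as bartowski's GGUFs usually do):

stream = generator.create_chat_completion(
    messages=[
        {"role": "system", "content": f"I am Tim Luka Horstmann, a German Computer Scientist. This is my CV:\n{context}"},
        {"role": "user", "content": query},
    ],
    max_tokens=512,
    stream=True,
)
for chunk in stream:
    delta = chunk["choices"][0]["delta"]
    if "content" in delta:  # the first chunk carries only the role
        print(delta["content"], end="", flush=True)
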
@@ -93,8 +95,8 @@ def stream_response(query):
         max_tokens=512,
         stream=True,
         stop=["<|im_end|>", "[DONE]"],
-        temperature=0.7,
-        top_p=0.9,
+        temperature=0.7,
+        top_p=0.9,
     ):
         yield f"data: {chunk['choices'][0]['text']}\n\n"
     yield "data: [DONE]\n\n"
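
For reference, a minimal client for this SSE stream (assumptions: the app listens on localhost:8000 and /predict, visible in the next hunk's context, accepts a POST body like {"query": ...}):

import requests

resp = requests.post(
    "http://localhost:8000/predict",
    json={"query": "Where did you study?"},
    stream=True,
)
for line in resp.iter_lines(decode_unicode=True):
    if not line or not line.startswith("data: "):
        continue  # skip blank keep-alive lines between events
    payload = line[len("data: "):]
    if payload == "[DONE]":
        break
    print(payload, end="", flush=True)
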
@@ -117,10 +119,20 @@ async def predict(request: QueryRequest):
 async def health_check():
     return {"status": "healthy"}
 
+@app.get("/model_info")
+async def model_info():
+    return {
+        "model_name": "Llama-3.2-1B-Instruct-GGUF",
+        "model_size": "1B",
+        "embedding_model": sentence_transformer_model,
+        "faiss_index_size": len(cv_chunks),
+        "faiss_index_dim": cv_embeddings.shape[1],
+    }
+
 @app.on_event("startup")
 async def warm_up_model():
     logger.info("Warming up the model...")
-    dummy_query = "
+    dummy_query = "Hi"  # Shorter prompt
     for _ in stream_response(dummy_query):
         pass
     logger.info("Model warm-up complete.")
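
One last note: @app.on_event("startup") still works but is deprecated in current FastAPI in favor of lifespan handlers. An equivalent sketch, not part of this commit (assumes FastAPI >= 0.93):

from contextlib import asynccontextmanager
from fastapi import FastAPI

@asynccontextmanager
async def lifespan(app: FastAPI):
    # Same warm-up as warm_up_model above, run once before serving.
    logger.info("Warming up the model...")
    for _ in stream_response("Hi"):
        pass
    logger.info("Model warm-up complete.")
    yield  # application serves requests; nothing to clean up afterwards

app = FastAPI(lifespan=lifespan)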