Tim Luka Horstmann
committed on
Commit · 9c89db3 · 1 Parent(s): a29c4ff
Improved model and RAG
Browse files
- app.py +50 -26
- cv_embeddings.json +0 -0
app.py
CHANGED
@@ -25,8 +25,17 @@ login(token=hf_token)
 
 # Models
 sentence_transformer_model = "all-MiniLM-L6-v2"
-repo_id = "bartowski/Llama-3.2-1B-Instruct-GGUF"
-filename = "Llama-3.2-1B-Instruct-Q4_K_M.gguf"
+repo_id = "bartowski/Llama-3.2-3B-Instruct-GGUF"  # Switched to 3B; revert to "bartowski/Llama-3.2-1B-Instruct-GGUF" if too heavy
+filename = "Llama-3.2-3B-Instruct-Q4_K_M.gguf"  # Use "Llama-3.2-1B-Instruct-Q4_K_M.gguf" for 1B
+
+# Define FAQs
+faqs = [
+    {"question": "What is your name?", "answer": "My name is Tim Luka Horstmann."},
+    {"question": "Where do you live?", "answer": "I live in Paris, France."},
+    {"question": "What is your education?", "answer": "I am currently pursuing an MSc in Data and AI at Institut Polytechnique de Paris. I also hold an MPhil in Advanced Computer Science from the University of Cambridge and a BSc in Business Informatics from RheinMain University of Applied Sciences."},
+    {"question": "What are your skills?", "answer": "I am proficient in Python, Java, SQL, Cypher, SPARQL, VBA, JavaScript, HTML/CSS, and Ruby. I also use tools like PyTorch, Hugging Face, Scikit-Learn, NumPy, Pandas, Matplotlib, Jupyter, Git, Bash, IoT, Ansible, QuickSight, and Wordpress."},
+    # Add more from your CV
+]
 
 try:
     # Load CV embeddings and build FAISS index
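Aside (not part of the commit): since both the repo and the filename changed here, it can be worth confirming that the Q4_K_M file actually exists in the new repo before the startup download runs. A minimal sketch using huggingface_hub, which is already a dependency of this app:

from huggingface_hub import list_repo_files

# List the repo's files and check for the quantization this commit selects;
# the list should include "Llama-3.2-3B-Instruct-Q4_K_M.gguf"
files = list_repo_files("bartowski/Llama-3.2-3B-Instruct-GGUF")
print([f for f in files if "Q4_K_M" in f])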
@@ -45,6 +54,11 @@ try:
     embedder = SentenceTransformer(sentence_transformer_model, device="cpu")
     logger.info("SentenceTransformer model loaded")
 
+    # Compute FAQ embeddings
+    faq_questions = [faq["question"] for faq in faqs]
+    faq_embeddings = embedder.encode(faq_questions, convert_to_numpy=True).astype("float32")
+    faiss.normalize_L2(faq_embeddings)
+
     # Load Llama model
     logger.info(f"Loading {filename} model")
     model_path = hf_hub_download(
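Why faiss.normalize_L2 here: once each embedding row has unit L2 norm, a plain dot product equals cosine similarity, which is what the FAQ cache check in stream_response (below) relies on. A self-contained sketch with stand-in vectors (384 is the output dimension of all-MiniLM-L6-v2):

import numpy as np
import faiss

faq_vecs = np.random.rand(4, 384).astype("float32")   # stand-ins for FAQ embeddings
query_vec = np.random.rand(1, 384).astype("float32")  # stand-in for a query embedding

faiss.normalize_L2(faq_vecs)   # in-place: each row now has unit L2 norm
faiss.normalize_L2(query_vec)

# With unit vectors, dot product == cosine similarity, in [-1, 1]
sims = np.dot(faq_vecs, query_vec.T).flatten()
print(sims.max(), int(sims.argmax()))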
@@ -53,11 +67,10 @@ try:
         local_dir="/app/cache" if os.getenv("HF_HOME") else None,
         token=hf_token,
     )
-
     generator = Llama(
         model_path=model_path,
-        n_ctx=1024,
-        n_threads=2,
+        n_ctx=1024,
+        n_threads=2,
         n_batch=512,
         n_gpu_layers=0,
         verbose=True,
@@ -82,24 +95,35 @@ def retrieve_context(query, top_k=3):
 def stream_response(query):
     try:
         logger.info(f"Processing query: {query}")
-        [18 removed lines: previous prompt construction and generation loop; not preserved in this render]
+        # Check FAQ cache
+        query_embedding = embedder.encode(query, convert_to_numpy=True).astype("float32")
+        query_embedding = query_embedding.reshape(1, -1)
+        faiss.normalize_L2(query_embedding)
+        similarities = np.dot(faq_embeddings, query_embedding.T).flatten()
+        max_sim = np.max(similarities)
+        if max_sim > 0.9:
+            idx = np.argmax(similarities)
+            yield f"data: {faqs[idx]['answer']}\n\n"
+            yield "data: [DONE]\n\n"
+        else:
+            context = retrieve_context(query)
+            prompt = (
+                f"<|im_start|>system\nYou are Tim Luka Horstmann, a Computer Scientist. Here is your CV:\n{context}\n"
+                f"A user is asking you a question about your CV. Respond as yourself, using the first person, and base your answer strictly on the information provided in the CV. Do not invent or assume any details not mentioned.\n<|im_end|>\n"
+                f"<|im_start|>user\n{query}\n<|im_end|>\n"
+                f"<|im_start|>assistant\n"
+            )
+            for chunk in generator(
+                prompt,
+                max_tokens=512,
+                stream=True,
+                stop=["<|im_end|>", "[DONE]"],
+                temperature=0.5,  # Lower for factual responses
+                top_p=0.9,
+                repeat_penalty=1.1,  # Reduce repetition/hallucination
+            ):
+                yield f"data: {chunk['choices'][0]['text']}\n\n"
+            yield "data: [DONE]\n\n"
     except Exception as e:
         logger.error(f"Error in stream_response: {str(e)}")
         yield f"data: Error: {str(e)}\n\n"
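stream_response yields Server-Sent-Events-style "data: ..." lines terminated by a "[DONE]" sentinel. A minimal client sketch for consuming the stream; the route name /api/chat and port 7860 are assumptions, since the endpoint wiring sits outside this diff:

import requests

# Hypothetical endpoint and port; adjust to the actual FastAPI route
with requests.get(
    "http://localhost:7860/api/chat",
    params={"query": "Where do you live?"},
    stream=True,
) as resp:
    for line in resp.iter_lines(decode_unicode=True):
        if not line or not line.startswith("data: "):
            continue  # skip blank separators between events
        payload = line[len("data: "):]
        if payload == "[DONE]":
            break
        print(payload, end="", flush=True)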
@@ -122,8 +146,8 @@ async def health_check():
 @app.get("/model_info")
 async def model_info():
     return {
-        "model_name": "Llama-3.2-1B-Instruct-GGUF",
-        "model_size": "1B",
+        "model_name": "Llama-3.2-3B-Instruct-GGUF",
+        "model_size": "3B",
         "embedding_model": sentence_transformer_model,
         "faiss_index_size": len(cv_chunks),
         "faiss_index_dim": cv_embeddings.shape[1],
@@ -132,7 +156,7 @@ async def model_info():
 @app.on_event("startup")
 async def warm_up_model():
     logger.info("Warming up the model...")
-    dummy_query = "Hi"
+    dummy_query = "Hi"
     for _ in stream_response(dummy_query):
         pass
     logger.info("Model warm-up complete.")
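A quick smoke test for the updated /model_info values after deployment (port 7860 is assumed as the usual Hugging Face Spaces default; adjust for your setup):

import requests

info = requests.get("http://localhost:7860/model_info").json()
assert info["model_name"] == "Llama-3.2-3B-Instruct-GGUF"
assert info["model_size"] == "3B"
print(info)  # also reports the embedding model and FAISS index size/dim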
cv_embeddings.json
CHANGED
The diff for this file is too large to render. See raw diff.