Tim Luka Horstmann committed
Commit 9c89db3 · 1 parent: a29c4ff

Improved model and RAG

Files changed (2)
  1. app.py +50 -26
  2. cv_embeddings.json +0 -0
app.py CHANGED
@@ -25,8 +25,17 @@ login(token=hf_token)
 
 # Models
 sentence_transformer_model = "all-MiniLM-L6-v2"
-repo_id = "bartowski/Llama-3.2-1B-Instruct-GGUF"
-filename = "Llama-3.2-1B-Instruct-Q4_K_M.gguf"  # Q4_K for speed
+repo_id = "bartowski/Llama-3.2-3B-Instruct-GGUF"  # Switched to 3B; revert to "bartowski/Llama-3.2-1B-Instruct-GGUF" if too heavy
+filename = "Llama-3.2-3B-Instruct-Q4_K_M.gguf"  # Use "Llama-3.2-1B-Instruct-Q4_K_M.gguf" for 1B
+
+# Define FAQs
+faqs = [
+    {"question": "What is your name?", "answer": "My name is Tim Luka Horstmann."},
+    {"question": "Where do you live?", "answer": "I live in Paris, France."},
+    {"question": "What is your education?", "answer": "I am currently pursuing an MSc in Data and AI at Institut Polytechnique de Paris. I also hold an MPhil in Advanced Computer Science from the University of Cambridge and a BSc in Business Informatics from RheinMain University of Applied Sciences."},
+    {"question": "What are your skills?", "answer": "I am proficient in Python, Java, SQL, Cypher, SPARQL, VBA, JavaScript, HTML/CSS, and Ruby. I also use tools like PyTorch, Hugging Face, Scikit-Learn, NumPy, Pandas, Matplotlib, Jupyter, Git, Bash, IoT, Ansible, QuickSight, and Wordpress."},
+    # Add more from your CV
+]
 
 try:
     # Load CV embeddings and build FAISS index
@@ -45,6 +54,11 @@ try:
     embedder = SentenceTransformer(sentence_transformer_model, device="cpu")
     logger.info("SentenceTransformer model loaded")
 
+    # Compute FAQ embeddings
+    faq_questions = [faq["question"] for faq in faqs]
+    faq_embeddings = embedder.encode(faq_questions, convert_to_numpy=True).astype("float32")
+    faiss.normalize_L2(faq_embeddings)
+
     # Load Llama model
     logger.info(f"Loading {filename} model")
     model_path = hf_hub_download(
@@ -53,11 +67,10 @@ try:
         local_dir="/app/cache" if os.getenv("HF_HOME") else None,
         token=hf_token,
     )
-
     generator = Llama(
         model_path=model_path,
-        n_ctx=1024,  # Reduced for speed
-        n_threads=2,  # Match HF Spaces vCPUs
+        n_ctx=1024,
+        n_threads=2,
         n_batch=512,
         n_gpu_layers=0,
         verbose=True,
@@ -82,24 +95,35 @@ def retrieve_context(query, top_k=3):
 def stream_response(query):
     try:
         logger.info(f"Processing query: {query}")
-        context = retrieve_context(query)
-        prompt = (
-            f"<|im_start|>system\nI am Tim Luka Horstmann, a German Computer Scientist. This is my CV:\n{context}\n"
-            f"I will answer your questions about my CV as myself. Please ask me anything!\n<|im_end|>\n"
-            f"<|im_start|>user\n{query}\n<|im_end|>\n"
-            f"<|im_start|>assistant\n"
-        )
-
-        for chunk in generator(
-            prompt,
-            max_tokens=512,
-            stream=True,
-            stop=["<|im_end|>", "[DONE]"],
-            temperature=0.7,
-            top_p=0.9,
-        ):
-            yield f"data: {chunk['choices'][0]['text']}\n\n"
-        yield "data: [DONE]\n\n"
+        # Check FAQ cache
+        query_embedding = embedder.encode(query, convert_to_numpy=True).astype("float32")
+        query_embedding = query_embedding.reshape(1, -1)
+        faiss.normalize_L2(query_embedding)
+        similarities = np.dot(faq_embeddings, query_embedding.T).flatten()
+        max_sim = np.max(similarities)
+        if max_sim > 0.9:
+            idx = np.argmax(similarities)
+            yield f"data: {faqs[idx]['answer']}\n\n"
+            yield "data: [DONE]\n\n"
+        else:
+            context = retrieve_context(query)
+            prompt = (
+                f"<|im_start|>system\nYou are Tim Luka Horstmann, a Computer Scientist. Here is your CV:\n{context}\n"
+                f"A user is asking you a question about your CV. Respond as yourself, using the first person, and base your answer strictly on the information provided in the CV. Do not invent or assume any details not mentioned.\n<|im_end|>\n"
+                f"<|im_start|>user\n{query}\n<|im_end|>\n"
+                f"<|im_start|>assistant\n"
+            )
+            for chunk in generator(
+                prompt,
+                max_tokens=512,
+                stream=True,
+                stop=["<|im_end|>", "[DONE]"],
+                temperature=0.5,  # Lower for factual responses
+                top_p=0.9,
+                repeat_penalty=1.1,  # Reduce repetition/hallucination
+            ):
+                yield f"data: {chunk['choices'][0]['text']}\n\n"
+            yield "data: [DONE]\n\n"
     except Exception as e:
         logger.error(f"Error in stream_response: {str(e)}")
         yield f"data: Error: {str(e)}\n\n"
@@ -122,8 +146,8 @@ async def health_check():
 @app.get("/model_info")
 async def model_info():
     return {
-        "model_name": "Llama-3.2-1B-Instruct-GGUF",
-        "model_size": "1B",
+        "model_name": "Llama-3.2-3B-Instruct-GGUF",
+        "model_size": "3B",
         "embedding_model": sentence_transformer_model,
         "faiss_index_size": len(cv_chunks),
         "faiss_index_dim": cv_embeddings.shape[1],
@@ -132,7 +156,7 @@ async def model_info():
 @app.on_event("startup")
 async def warm_up_model():
     logger.info("Warming up the model...")
-    dummy_query = "Hi"  # Shorter prompt
+    dummy_query = "Hi"
     for _ in stream_response(dummy_query):
         pass
     logger.info("Model warm-up complete.")
cv_embeddings.json CHANGED
The diff for this file is too large to render. See raw diff
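
For completeness, stream_response emits Server-Sent-Events-style "data: ..." chunks terminated by a "data: [DONE]" sentinel. A client could consume the stream as sketched below; note that the /query route, its "query" parameter, and the port are assumptions, since the diff does not show the FastAPI route that wraps stream_response.

# Hypothetical client for the SSE stream; the /query route, its "query"
# parameter, and the port are assumptions -- adjust to the app's real route.
import requests

def ask(question, base_url="http://localhost:7860"):
    """Collect streamed "data: ..." chunks until the [DONE] sentinel."""
    parts = []
    with requests.get(f"{base_url}/query", params={"query": question}, stream=True) as resp:
        resp.raise_for_status()
        for raw in resp.iter_lines(decode_unicode=True):
            if not raw or not raw.startswith("data: "):
                continue  # skip the blank separator lines between SSE events
            payload = raw[len("data: "):]
            if payload == "[DONE]":
                break
            parts.append(payload)
    return "".join(parts)

print(ask("What is your education?"))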