Tim Luka Horstmann committed
Commit 293413b · 1 Parent(s): 48a65b5

Updated backend with chat completion

Files changed (1):
  1. app.py +47 -45
app.py CHANGED
@@ -1,4 +1,7 @@
+# app.py
+
 import json
+import time
 import numpy as np
 from sentence_transformers import SentenceTransformer
 from fastapi import FastAPI, HTTPException
@@ -71,7 +74,7 @@ try:
     )
     generator = Llama(
         model_path=model_path,
-        n_ctx=2048, # Adjust if 128k is supported and memory allows; start with 1024
+        n_ctx=1024, # Adjust if 128k is supported and memory allows; start with 1024
         n_threads=2,
         n_batch=512,
         n_gpu_layers=0, # No GPU on free tier
@@ -83,7 +86,7 @@ except Exception as e:
    logger.error(f"Startup error: {str(e)}", exc_info=True)
    raise

-def retrieve_context(query, top_k=3):
+def retrieve_context(query, top_k=2):
    try:
        query_embedding = embedder.encode(query, convert_to_numpy=True).astype("float32")
        query_embedding = query_embedding.reshape(1, -1)
@@ -95,50 +98,49 @@ def retrieve_context(query, top_k=3):
        raise

 def stream_response(query):
-    try:
-        logger.info(f"Processing query: {query}")
-        # Check FAQ cache (unchanged)
-        query_embedding = embedder.encode(query, convert_to_numpy=True).astype("float32")
-        query_embedding = query_embedding.reshape(1, -1)
-        faiss.normalize_L2(query_embedding)
-        similarities = np.dot(faq_embeddings, query_embedding.T).flatten()
-        max_sim = np.max(similarities)
-        if max_sim > 0.9:
-            idx = np.argmax(similarities)
-            yield f"data: {faqs[idx]['answer']}\n\n"
-            yield "data: [DONE]\n\n"
-            return
-
-        context = retrieve_context(query)
-        prompt = (
-            f"<|begin_of_text|><|start_header_id|>system<|end_header_id>\n"
-            f"You are Tim Luka Horstmann, a Computer Scientist. Here is your CV:\n{context}\n"
-            f"A user is asking you a question. Respond as yourself, using the first person, in a friendly and concise manner. For questions about your CV, base your answer strictly on the provided CV information. For casual questions not covered by the CV, respond naturally but do not invent specific details beyond what’s generally true about you (e.g., your current location or field of work). Avoid meta-commentary or critiquing your own response.\n"
-            f"<|eot_id|><|start_header_id|>user<|end_header_id>\n"
-            f"{query}\n"
-            f"<|eot_id|><|start_header_id|>assistant<|end_header_id>\n"
-        )
-
-        response_text = ""
-        for chunk in generator(
-            prompt,
-            max_tokens=200,
-            stream=True,
-            stop=["<|eot_id|>", "[DONE]"], # Updated stop tokens
-            temperature=0.5,
-            top_p=0.9,
-            repeat_penalty=1.2,
-        ):
-            text = chunk['choices'][0]['text']
-            response_text += text
-            yield f"data: {text}\n\n"
-            if "<|eot_id>" in response_text or "[DONE]" in response_text:
-                break
-        yield "data: [DONE]\n\n"
-    except Exception as e:
-        logger.error(f"Error in stream_response: {str(e)}")
-        yield f"data: Error: {str(e)}\n\n"
+    logger.info(f"Processing query: {query}")
+    start_time = time.time()
+
+    # FAQ check first
+    query_embedding = embedder.encode(query, convert_to_numpy=True).astype("float32")
+    query_embedding = query_embedding.reshape(1, -1)
+    faiss.normalize_L2(query_embedding)
+    similarities = np.dot(faq_embeddings, query_embedding.T).flatten()
+    max_sim = np.max(similarities)
+    if max_sim > 0.9:
+        idx = np.argmax(similarities)
+        yield f"data: {faqs[idx]['answer']}\n\n"
         yield "data: [DONE]\n\n"
+        return
+
+    yield "data: I'm thinking...\n\n"
+    context = retrieve_context(query, top_k=2)
+    messages = [
+        {"role": "system", "content": f"You are Tim Luka Horstmann, a Computer Scientist. A user is asking you a question. Respond as yourself, using the first person, in a friendly and concise manner. For questions about your CV, base your answer strictly on the provided CV information. For casual questions not covered by the CV, respond naturally but do not invent specific details beyond what’s generally true about you (e.g., your current location or field of work). Avoid meta-commentary or critiquing your own response. CV: {context}"},
+        {"role": "user", "content": query}
+    ]
+
+    buffer = ""
+    for chunk in generator.create_chat_completion(
+        messages=messages,
+        max_tokens=512,
+        stream=True,
+        temperature=0.5,
+        top_p=0.9,
+        repeat_penalty=1.2
+    ):
+        text = chunk['choices'][0]['delta'].get('content', '')
+        if text:
+            buffer += text
+            if buffer.endswith(" ") or buffer.endswith(".") or buffer.endswith("!"):
+                yield f"data: {buffer}\n\n"
+                buffer = ""
+            if time.time() - start_time > 1: # Log first token
+                logger.info(f"First token time: {time.time() - start_time:.2f}s")
+                break
+    if buffer:
+        yield f"data: {buffer}\n\n"
+    yield "data: [DONE]\n\n"

 class QueryRequest(BaseModel):
     data: list
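
A note on the FAQ short-circuit in stream_response: after faiss.normalize_L2, every embedding row has unit L2 norm, so the plain np.dot against faq_embeddings computes cosine similarity, and max_sim > 0.9 acts as a near-duplicate gate before any LLM call. A self-contained sketch of that check with toy vectors (values are illustrative; the real embeddings come from the SentenceTransformer model):

import numpy as np
import faiss

# Toy stand-ins for the FAQ matrix and a query embedding (illustrative values).
faq_embeddings = np.array([[1.0, 0.0], [0.6, 0.8]], dtype="float32")
query_embedding = np.array([[0.62, 0.79]], dtype="float32")

faiss.normalize_L2(faq_embeddings)    # in place: each row now has unit L2 norm
faiss.normalize_L2(query_embedding)

# For unit vectors, the dot product equals cosine similarity in [-1, 1].
similarities = np.dot(faq_embeddings, query_embedding.T).flatten()
max_sim = np.max(similarities)
if max_sim > 0.9:                     # same threshold as the cache-hit gate above
    print("FAQ hit at index", int(np.argmax(similarities)))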
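With stream=True, llama-cpp-python's create_chat_completion yields OpenAI-style chunks whose payload sits under choices[0]['delta']; the first chunk typically carries only the role and no content, which is why the loop reads the delta with .get('content', ''). A minimal consumer sketch, assuming generator is the llama_cpp.Llama instance constructed in app.py:

# Minimal streaming consumer; assumes `generator` is the Llama instance above.
messages = [
    {"role": "system", "content": "You are a concise assistant."},
    {"role": "user", "content": "Say hello."},
]
for chunk in generator.create_chat_completion(messages=messages, max_tokens=32, stream=True):
    delta = chunk["choices"][0]["delta"]   # OpenAI-style delta dict
    text = delta.get("content", "")        # first chunk may hold only {"role": ...}
    if text:
        print(text, end="", flush=True)
print()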
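The route that feeds stream_response to clients falls outside this hunk. As a sketch of how such a generator is commonly exposed from FastAPI, assuming a hypothetical /api/predict path and the QueryRequest model defined above:

# Hypothetical wiring; the actual route in app.py is not shown in this diff.
from fastapi.responses import StreamingResponse

@app.post("/api/predict")
async def predict(request: QueryRequest):
    query = request.data[0]  # assumes the query is the first element of `data`
    return StreamingResponse(
        stream_response(query),            # sync generators are supported here
        media_type="text/event-stream",    # SSE content type
    )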
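On the wire, each yield above is framed as one server-sent event: a "data:" line terminated by a blank line, with [DONE] as the end-of-stream sentinel. A hypothetical client that consumes the stream (URL, port, and payload shape are assumptions):

import requests

# Hypothetical SSE client for the endpoint sketched above.
with requests.post(
    "http://localhost:7860/api/predict",
    json={"data": ["What do you currently work on?"]},
    stream=True,
) as resp:
    for line in resp.iter_lines(decode_unicode=True):
        if not line or not line.startswith("data: "):
            continue                      # skip blank event separators
        payload = line[len("data: "):]
        if payload == "[DONE]":           # end-of-stream sentinel
            break
        print(payload, end=" ", flush=True)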