Tim Luka Horstmann committed
Commit · 293413b · 1 Parent(s): 48a65b5
Updated backend with chat completion
app.py CHANGED
@@ -1,4 +1,7 @@
+# app.py
+
 import json
+import time
 import numpy as np
 from sentence_transformers import SentenceTransformer
 from fastapi import FastAPI, HTTPException
@@ -71,7 +74,7 @@ try:
     )
     generator = Llama(
         model_path=model_path,
-        n_ctx=
+        n_ctx=1024,  # Adjust if 128k is supported and memory allows; start with 1024
         n_threads=2,
         n_batch=512,
         n_gpu_layers=0,  # No GPU on free tier
@@ -83,7 +86,7 @@ except Exception as e:
     logger.error(f"Startup error: {str(e)}", exc_info=True)
     raise

-def retrieve_context(query, top_k=3):
+def retrieve_context(query, top_k=2):
     try:
         query_embedding = embedder.encode(query, convert_to_numpy=True).astype("float32")
         query_embedding = query_embedding.reshape(1, -1)
@@ -95,50 +98,49 @@ def retrieve_context(query, top_k=3):
         raise

 def stream_response(query):
-    ...
-        return
-
-    context = retrieve_context(query)
-    prompt = (
-        f"<|begin_of_text|><|start_header_id|>system<|end_header_id>\n"
-        f"You are Tim Luka Horstmann, a Computer Scientist. Here is your CV:\n{context}\n"
-        f"A user is asking you a question. Respond as yourself, using the first person, in a friendly and concise manner. For questions about your CV, base your answer strictly on the provided CV information. For casual questions not covered by the CV, respond naturally but do not invent specific details beyond what’s generally true about you (e.g., your current location or field of work). Avoid meta-commentary or critiquing your own response.\n"
-        f"<|eot_id|><|start_header_id|>user<|end_header_id>\n"
-        f"{query}\n"
-        f"<|eot_id|><|start_header_id|>assistant<|end_header_id>\n"
-    )
-
-    response_text = ""
-    for chunk in generator(
-        prompt,
-        max_tokens=200,
-        stream=True,
-        stop=["<|eot_id|>", "[DONE]"],  # Updated stop tokens
-        temperature=0.5,
-        top_p=0.9,
-        repeat_penalty=1.2,
-    ):
-        text = chunk['choices'][0]['text']
-        response_text += text
-        yield f"data: {text}\n\n"
-        if "<|eot_id>" in response_text or "[DONE]" in response_text:
-            break
-    yield "data: [DONE]\n\n"
-    except Exception as e:
-        logger.error(f"Error in stream_response: {str(e)}")
-        yield f"data: Error: {str(e)}\n\n"
+    logger.info(f"Processing query: {query}")
+    start_time = time.time()
+
+    # FAQ check first
+    query_embedding = embedder.encode(query, convert_to_numpy=True).astype("float32")
+    query_embedding = query_embedding.reshape(1, -1)
+    faiss.normalize_L2(query_embedding)
+    similarities = np.dot(faq_embeddings, query_embedding.T).flatten()
+    max_sim = np.max(similarities)
+    if max_sim > 0.9:
+        idx = np.argmax(similarities)
+        yield f"data: {faqs[idx]['answer']}\n\n"
         yield "data: [DONE]\n\n"
+        return
+
+    yield "data: I'm thinking...\n\n"
+    context = retrieve_context(query, top_k=2)
+    messages = [
+        {"role": "system", "content": f"You are Tim Luka Horstmann, a Computer Scientist. A user is asking you a question. Respond as yourself, using the first person, in a friendly and concise manner. For questions about your CV, base your answer strictly on the provided CV information. For casual questions not covered by the CV, respond naturally but do not invent specific details beyond what’s generally true about you (e.g., your current location or field of work). Avoid meta-commentary or critiquing your own response. CV: {context}"},
+        {"role": "user", "content": query}
+    ]
+
+    buffer = ""
+    for chunk in generator.create_chat_completion(
+        messages=messages,
+        max_tokens=512,
+        stream=True,
+        temperature=0.5,
+        top_p=0.9,
+        repeat_penalty=1.2
+    ):
+        text = chunk['choices'][0]['delta'].get('content', '')
+        if text:
+            buffer += text
+            if buffer.endswith(" ") or buffer.endswith(".") or buffer.endswith("!"):
+                yield f"data: {buffer}\n\n"
+                buffer = ""
+            if time.time() - start_time > 1:  # Log first token
+                logger.info(f"First token time: {time.time() - start_time:.2f}s")
+                break
+    if buffer:
+        yield f"data: {buffer}\n\n"
+    yield "data: [DONE]\n\n"

 class QueryRequest(BaseModel):
     data: list
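Two pieces of context that the new stream_response relies on but that sit outside the hunks shown above. First, the FAQ short-circuit reads faqs and faq_embeddings, which must be prepared at startup; for the dot product in stream_response to be a cosine similarity, the FAQ embeddings have to be L2-normalized like the query embedding. A minimal sketch under that assumption (the real loading code and FAQ entries are not in this diff):

import faiss

# Hypothetical startup code; `embedder` is the SentenceTransformer created earlier in app.py.
faqs = [
    {"question": "What do you do?", "answer": "..."},  # placeholder entries, not the real FAQ data
]
faq_embeddings = embedder.encode(
    [f["question"] for f in faqs], convert_to_numpy=True
).astype("float32")
faiss.normalize_L2(faq_embeddings)  # unit-norm rows, so np.dot(...) in stream_response is cosine similarity

Second, stream_response yields Server-Sent-Events frames ("data: ...\n\n", closed by "data: [DONE]\n\n"), so the route serving it presumably wraps the generator in a StreamingResponse. The endpoint itself is not part of this diff; this sketch uses an assumed route path and payload layout:

from fastapi.responses import StreamingResponse

# Hypothetical route; assumes `app` is the FastAPI instance and QueryRequest is the model defined above.
@app.post("/api/predict")  # route name is an assumption
async def predict(request: QueryRequest):
    query = request.data[0]  # assumes the query text is the first element of `data`
    return StreamingResponse(stream_response(query), media_type="text/event-stream")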