Tim Luka Horstmann committed
Commit 293413b · 1 Parent(s): 48a65b5

Updated backend with chat completion

Files changed (1):
  1. app.py +47 -45
app.py CHANGED
@@ -1,4 +1,7 @@
+# app.py
+
 import json
+import time
 import numpy as np
 from sentence_transformers import SentenceTransformer
 from fastapi import FastAPI, HTTPException
@@ -71,7 +74,7 @@ try:
     )
     generator = Llama(
         model_path=model_path,
-        n_ctx=2048, # Adjust if 128k is supported and memory allows; start with 1024
+        n_ctx=1024, # Adjust if 128k is supported and memory allows; start with 1024
         n_threads=2,
         n_batch=512,
         n_gpu_layers=0, # No GPU on free tier
@@ -83,7 +86,7 @@ except Exception as e:
    logger.error(f"Startup error: {str(e)}", exc_info=True)
    raise

-def retrieve_context(query, top_k=3):
+def retrieve_context(query, top_k=2):
    try:
        query_embedding = embedder.encode(query, convert_to_numpy=True).astype("float32")
        query_embedding = query_embedding.reshape(1, -1)
@@ -95,50 +98,49 @@ def retrieve_context(query, top_k=3):
        raise

 def stream_response(query):
-    try:
-        logger.info(f"Processing query: {query}")
-        # Check FAQ cache (unchanged)
-        query_embedding = embedder.encode(query, convert_to_numpy=True).astype("float32")
-        query_embedding = query_embedding.reshape(1, -1)
-        faiss.normalize_L2(query_embedding)
-        similarities = np.dot(faq_embeddings, query_embedding.T).flatten()
-        max_sim = np.max(similarities)
-        if max_sim > 0.9:
-            idx = np.argmax(similarities)
-            yield f"data: {faqs[idx]['answer']}\n\n"
-            yield "data: [DONE]\n\n"
-            return
-
-        context = retrieve_context(query)
-        prompt = (
-            f"<|begin_of_text|><|start_header_id|>system<|end_header_id>\n"
-            f"You are Tim Luka Horstmann, a Computer Scientist. Here is your CV:\n{context}\n"
-            f"A user is asking you a question. Respond as yourself, using the first person, in a friendly and concise manner. For questions about your CV, base your answer strictly on the provided CV information. For casual questions not covered by the CV, respond naturally but do not invent specific details beyond what’s generally true about you (e.g., your current location or field of work). Avoid meta-commentary or critiquing your own response.\n"
-            f"<|eot_id|><|start_header_id|>user<|end_header_id>\n"
-            f"{query}\n"
-            f"<|eot_id|><|start_header_id|>assistant<|end_header_id>\n"
-        )
-
-        response_text = ""
-        for chunk in generator(
-            prompt,
-            max_tokens=200,
-            stream=True,
-            stop=["<|eot_id|>", "[DONE]"], # Updated stop tokens
-            temperature=0.5,
-            top_p=0.9,
-            repeat_penalty=1.2,
-        ):
-            text = chunk['choices'][0]['text']
-            response_text += text
-            yield f"data: {text}\n\n"
-            if "<|eot_id>" in response_text or "[DONE]" in response_text:
-                break
-        yield "data: [DONE]\n\n"
-    except Exception as e:
-        logger.error(f"Error in stream_response: {str(e)}")
-        yield f"data: Error: {str(e)}\n\n"
+    logger.info(f"Processing query: {query}")
+    start_time = time.time()
+
+    # FAQ check first
+    query_embedding = embedder.encode(query, convert_to_numpy=True).astype("float32")
+    query_embedding = query_embedding.reshape(1, -1)
+    faiss.normalize_L2(query_embedding)
+    similarities = np.dot(faq_embeddings, query_embedding.T).flatten()
+    max_sim = np.max(similarities)
+    if max_sim > 0.9:
+        idx = np.argmax(similarities)
+        yield f"data: {faqs[idx]['answer']}\n\n"
         yield "data: [DONE]\n\n"
+        return
+
+    yield "data: I'm thinking...\n\n"
+    context = retrieve_context(query, top_k=2)
+    messages = [
+        {"role": "system", "content": f"You are Tim Luka Horstmann, a Computer Scientist. A user is asking you a question. Respond as yourself, using the first person, in a friendly and concise manner. For questions about your CV, base your answer strictly on the provided CV information. For casual questions not covered by the CV, respond naturally but do not invent specific details beyond what’s generally true about you (e.g., your current location or field of work). Avoid meta-commentary or critiquing your own response. CV: {context}"},
+        {"role": "user", "content": query}
+    ]
+
+    buffer = ""
+    for chunk in generator.create_chat_completion(
+        messages=messages,
+        max_tokens=512,
+        stream=True,
+        temperature=0.5,
+        top_p=0.9,
+        repeat_penalty=1.2
+    ):
+        text = chunk['choices'][0]['delta'].get('content', '')
+        if text:
+            buffer += text
+            if buffer.endswith(" ") or buffer.endswith(".") or buffer.endswith("!"):
+                yield f"data: {buffer}\n\n"
+                buffer = ""
+            if time.time() - start_time > 1: # Log first token
+                logger.info(f"First token time: {time.time() - start_time:.2f}s")
+                break
+    if buffer:
+        yield f"data: {buffer}\n\n"
+    yield "data: [DONE]\n\n"

 class QueryRequest(BaseModel):
     data: list
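
A note on the FAQ short-circuit in stream_response: after faiss.normalize_L2, every embedding row has unit L2 norm, so the plain np.dot against faq_embeddings computes cosine similarity, and max_sim > 0.9 acts as a near-duplicate gate before any LLM call. A self-contained sketch of that check with toy vectors (values are illustrative; the real embeddings come from the SentenceTransformer model):

import numpy as np
import faiss

# Toy stand-ins for the FAQ matrix and a query embedding (illustrative values).
faq_embeddings = np.array([[1.0, 0.0], [0.6, 0.8]], dtype="float32")
query_embedding = np.array([[0.62, 0.79]], dtype="float32")

faiss.normalize_L2(faq_embeddings)    # in place: each row now has unit L2 norm
faiss.normalize_L2(query_embedding)

# For unit vectors, the dot product equals cosine similarity in [-1, 1].
similarities = np.dot(faq_embeddings, query_embedding.T).flatten()
max_sim = np.max(similarities)
if max_sim > 0.9:                     # same threshold as the cache-hit gate above
    print("FAQ hit at index", int(np.argmax(similarities)))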
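With stream=True, llama-cpp-python's create_chat_completion yields OpenAI-style chunks whose payload sits under choices[0]['delta']; the first chunk typically carries only the role and no content, which is why the loop reads the delta with .get('content', ''). A minimal consumer sketch, assuming generator is the llama_cpp.Llama instance constructed in app.py:

# Minimal streaming consumer; assumes `generator` is the Llama instance above.
messages = [
    {"role": "system", "content": "You are a concise assistant."},
    {"role": "user", "content": "Say hello."},
]
for chunk in generator.create_chat_completion(messages=messages, max_tokens=32, stream=True):
    delta = chunk["choices"][0]["delta"]   # OpenAI-style delta dict
    text = delta.get("content", "")        # first chunk may hold only {"role": ...}
    if text:
        print(text, end="", flush=True)
print()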
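The route that feeds stream_response to clients falls outside this hunk. As a sketch of how such a generator is commonly exposed from FastAPI, assuming a hypothetical /api/predict path and the QueryRequest model defined above:

# Hypothetical wiring; the actual route in app.py is not shown in this diff.
from fastapi.responses import StreamingResponse

@app.post("/api/predict")
async def predict(request: QueryRequest):
    query = request.data[0]  # assumes the query is the first element of `data`
    return StreamingResponse(
        stream_response(query),            # sync generators are supported here
        media_type="text/event-stream",    # SSE content type
    )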
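On the wire, each yield above is framed as one server-sent event: a "data:" line terminated by a blank line, with [DONE] as the end-of-stream sentinel. A hypothetical client that consumes the stream (URL, port, and payload shape are assumptions):

import requests

# Hypothetical SSE client for the endpoint sketched above.
with requests.post(
    "http://localhost:7860/api/predict",
    json={"data": ["What do you currently work on?"]},
    stream=True,
) as resp:
    for line in resp.iter_lines(decode_unicode=True):
        if not line or not line.startswith("data: "):
            continue                      # skip blank event separators
        payload = line[len("data: "):]
        if payload == "[DONE]":           # end-of-stream sentinel
            break
        print(payload, end=" ", flush=True)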